mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-05-08 08:18:51 +00:00
Compare commits
4 Commits
ca7e008d78
...
2ca782a65a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2ca782a65a | ||
|
|
d51f35bf63 | ||
|
|
0b65927b1b | ||
|
|
831b321c6a |
@ -19,7 +19,6 @@
|
|||||||
#include "common/media_io.h"
|
#include "common/media_io.h"
|
||||||
#include "common/resource_owners.hpp"
|
#include "common/resource_owners.hpp"
|
||||||
#include "image_metadata.h"
|
#include "image_metadata.h"
|
||||||
#include "llm.hpp"
|
|
||||||
|
|
||||||
namespace fs = std::filesystem;
|
namespace fs = std::filesystem;
|
||||||
|
|
||||||
@ -501,15 +500,6 @@ int main(int argc, const char* argv[]) {
|
|||||||
SDContextParams ctx_params;
|
SDContextParams ctx_params;
|
||||||
SDGenerationParams gen_params;
|
SDGenerationParams gen_params;
|
||||||
|
|
||||||
cli_params.verbose = true;
|
|
||||||
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
|
||||||
GemmaTokenizer tokenizer;
|
|
||||||
auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
|
|
||||||
for (auto token : tokens) {
|
|
||||||
LOG_INFO("%d", token);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
parse_args(argc, argv, cli_params, ctx_params, gen_params);
|
parse_args(argc, argv, cli_params, ctx_params, gen_params);
|
||||||
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
||||||
log_verbose = cli_params.verbose;
|
log_verbose = cli_params.verbose;
|
||||||
|
|||||||
@ -340,6 +340,10 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
"--high-noise-diffusion-model",
|
"--high-noise-diffusion-model",
|
||||||
"path to the standalone high noise diffusion model",
|
"path to the standalone high noise diffusion model",
|
||||||
&high_noise_diffusion_model_path},
|
&high_noise_diffusion_model_path},
|
||||||
|
{"",
|
||||||
|
"--embeddings-connectors",
|
||||||
|
"path to LTXAV embeddings connectors",
|
||||||
|
&embeddings_connectors_path},
|
||||||
{"",
|
{"",
|
||||||
"--vae",
|
"--vae",
|
||||||
"path to standalone vae model",
|
"path to standalone vae model",
|
||||||
@ -656,6 +660,7 @@ std::string SDContextParams::to_string() const {
|
|||||||
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
|
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
|
||||||
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
|
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
|
||||||
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
|
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
|
||||||
|
<< " embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
|
||||||
<< " vae_path: \"" << vae_path << "\",\n"
|
<< " vae_path: \"" << vae_path << "\",\n"
|
||||||
<< " taesd_path: \"" << taesd_path << "\",\n"
|
<< " taesd_path: \"" << taesd_path << "\",\n"
|
||||||
<< " esrgan_path: \"" << esrgan_path << "\",\n"
|
<< " esrgan_path: \"" << esrgan_path << "\",\n"
|
||||||
@ -712,6 +717,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
|
|||||||
llm_vision_path.c_str(),
|
llm_vision_path.c_str(),
|
||||||
diffusion_model_path.c_str(),
|
diffusion_model_path.c_str(),
|
||||||
high_noise_diffusion_model_path.c_str(),
|
high_noise_diffusion_model_path.c_str(),
|
||||||
|
embeddings_connectors_path.c_str(),
|
||||||
vae_path.c_str(),
|
vae_path.c_str(),
|
||||||
taesd_path.c_str(),
|
taesd_path.c_str(),
|
||||||
control_net_path.c_str(),
|
control_net_path.c_str(),
|
||||||
@ -2180,6 +2186,7 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
|
|||||||
params.strength = strength;
|
params.strength = strength;
|
||||||
params.seed = seed;
|
params.seed = seed;
|
||||||
params.video_frames = video_frames;
|
params.video_frames = video_frames;
|
||||||
|
params.fps = fps;
|
||||||
params.vace_strength = vace_strength;
|
params.vace_strength = vace_strength;
|
||||||
params.vae_tiling_params = vae_tiling_params;
|
params.vae_tiling_params = vae_tiling_params;
|
||||||
params.cache = cache_params;
|
params.cache = cache_params;
|
||||||
|
|||||||
@ -92,6 +92,7 @@ struct SDContextParams {
|
|||||||
std::string llm_vision_path;
|
std::string llm_vision_path;
|
||||||
std::string diffusion_model_path;
|
std::string diffusion_model_path;
|
||||||
std::string high_noise_diffusion_model_path;
|
std::string high_noise_diffusion_model_path;
|
||||||
|
std::string embeddings_connectors_path;
|
||||||
std::string vae_path;
|
std::string vae_path;
|
||||||
std::string taesd_path;
|
std::string taesd_path;
|
||||||
std::string esrgan_path;
|
std::string esrgan_path;
|
||||||
|
|||||||
@ -171,6 +171,7 @@ typedef struct {
|
|||||||
const char* llm_vision_path;
|
const char* llm_vision_path;
|
||||||
const char* diffusion_model_path;
|
const char* diffusion_model_path;
|
||||||
const char* high_noise_diffusion_model_path;
|
const char* high_noise_diffusion_model_path;
|
||||||
|
const char* embeddings_connectors_path;
|
||||||
const char* vae_path;
|
const char* vae_path;
|
||||||
const char* taesd_path;
|
const char* taesd_path;
|
||||||
const char* control_net_path;
|
const char* control_net_path;
|
||||||
@ -359,6 +360,7 @@ typedef struct {
|
|||||||
float strength;
|
float strength;
|
||||||
int64_t seed;
|
int64_t seed;
|
||||||
int video_frames;
|
int video_frames;
|
||||||
|
int fps;
|
||||||
float vace_strength;
|
float vace_strength;
|
||||||
sd_tiling_params_t vae_tiling_params;
|
sd_tiling_params_t vae_tiling_params;
|
||||||
sd_cache_params_t cache;
|
sd_cache_params_t cache;
|
||||||
|
|||||||
@ -103,6 +103,64 @@ namespace DiT {
|
|||||||
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
|
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Cut a video-shaped tensor into non-overlapping (pt, ph, pw) patches and
 * flatten every patch into one token.
 *
 * Shapes in the comments use torch order (outermost first); ggml `ne[]` is the
 * reverse of that order.
 *
 * @param ctx  ggml context in which the new graph nodes are created.
 * @param x    input laid out as [N*C, T, H, W].
 * @param pt   patch size along T; must be positive and divide T exactly.
 * @param ph   patch size along H; must be positive and divide H exactly.
 * @param pw   patch size along W; must be positive and divide W exactly.
 * @param N    batch size folded into the outermost dimension of x (default 1);
 *             must be positive and divide x->ne[3] exactly.
 * @return     tensor laid out as [N, t_len*h_len*w_len, C*pt*ph*pw], where
 *             t_len = T/pt, h_len = H/ph and w_len = W/pw.
 */
inline ggml_tensor* patchify(ggml_context* ctx,
                             ggml_tensor* x,
                             int pt,
                             int ph,
                             int pw,
                             int64_t N = 1) {
    // x: [N*C, T, H, W]
    // return: [N, t_len*h_len*w_len, C*pt*ph*pw]

    // All four values are used as divisors below, so reject zero/negative
    // values before dividing instead of relying on the later consistency checks.
    GGML_ASSERT(N > 0 && pt > 0 && ph > 0 && pw > 0);

    const int64_t C     = x->ne[3] / N;
    const int64_t T     = x->ne[2];
    const int64_t H     = x->ne[1];
    const int64_t W     = x->ne[0];
    const int64_t t_len = T / pt;
    const int64_t h_len = H / ph;
    const int64_t w_len = W / pw;

    // The divisions above truncate; make sure nothing was dropped.
    GGML_ASSERT(C * N == x->ne[3]);
    GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);

    // Each reshape peels one patch axis off its grid axis, and the following
    // permute moves that patch axis towards the innermost position.
    x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N);      // [N*C*t_len, pt, h_len*ph, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N);      // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N);      // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, w_len, pt*ph, pw]
    x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N);      // [N, C, t_len*h_len*w_len, pt*ph*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N, t_len*h_len*w_len, C, pt*ph*pw]
    x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1);  // [N, t_len*h_len*w_len, C*pt*ph*pw]
    return x;
}
|
||||||
|
|
||||||
|
/**
 * Turn a sequence of flattened (pt, ph, pw) patches back into a video-shaped
 * tensor.
 *
 * Shapes in the comments use torch order (outermost first); ggml `ne[]` is the
 * reverse of that order.
 *
 * @param ctx    ggml context in which the new graph nodes are created.
 * @param x      input holding N * t_len*h_len*w_len tokens of pt*ph*pw*C values
 *               each, i.e. [N, t_len*h_len*w_len, pt*ph*pw*C]. The batch may sit
 *               in ne[2] or in ne[3]; it is derived from the element count.
 * @param t_len  number of patches along T; must be positive.
 * @param h_len  number of patches along H; must be positive.
 * @param w_len  number of patches along W; must be positive.
 * @param pt     patch size along T; must be positive.
 * @param ph     patch size along H; must be positive.
 * @param pw     patch size along W; must be positive.
 * @return       tensor laid out as [N*C, t_len*pt, h_len*ph, w_len*pw].
 */
inline ggml_tensor* unpatchify(ggml_context* ctx,
                               ggml_tensor* x,
                               int64_t t_len,
                               int64_t h_len,
                               int64_t w_len,
                               int pt,
                               int ph,
                               int pw) {
    // x: [N, t_len*h_len*w_len, pt*ph*pw*C]
    // return: [N*C, t_len*pt, h_len*ph, w_len*pw]

    // These values are used as divisors below; reject zero/negative up front.
    GGML_ASSERT(pt > 0 && ph > 0 && pw > 0);
    GGML_ASSERT(t_len > 0 && h_len > 0 && w_len > 0);

    const int64_t patch_volume = static_cast<int64_t>(pt) * ph * pw;
    const int64_t n_tokens     = t_len * h_len * w_len;

    const int64_t C = x->ne[0] / patch_volume;
    GGML_ASSERT(C > 0 && C * patch_volume == x->ne[0]);

    // Derive the batch size from the element count rather than from a fixed
    // axis. A token sequence whose batch lives in ne[2] (ne[3] == 1) then works
    // as well as one whose batch lives in ne[3]; for the latter the value is
    // exactly x->ne[3].
    const int64_t per_sample = x->ne[0] * n_tokens;
    const int64_t N          = ggml_nelements(x) / per_sample;
    GGML_ASSERT(N > 0 && N * per_sample == ggml_nelements(x));

    // Each permute moves one patch axis back next to its grid axis, and the
    // following reshape fuses the two.
    x = ggml_reshape_4d(ctx, x, C, patch_volume, n_tokens, N);               // [N, t_len*h_len*w_len, pt*ph*pw, C]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));      // [N, C, t_len*h_len*w_len, pt*ph*pw]
    x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N);  // [N*C*t_len*h_len, w_len, pt*ph, pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N);  // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N);  // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len, pt, h_len*ph, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N);  // [N*C, t_len*pt, h_len*ph, w_len*pw]
    return x;
}
|
||||||
} // namespace DiT
|
} // namespace DiT
|
||||||
|
|
||||||
#endif // __COMMON_DIT_HPP__
|
#endif // __COMMON_DIT_HPP__
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
#ifndef __CONDITIONER_HPP__
|
#ifndef __CONDITIONER_HPP__
|
||||||
#define __CONDITIONER_HPP__
|
#define __CONDITIONER_HPP__
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <limits>
|
||||||
#include <optional>
|
#include <optional>
|
||||||
|
|
||||||
#include "clip.hpp"
|
#include "clip.hpp"
|
||||||
@ -46,6 +48,17 @@ static inline sd::Tensor<float> apply_token_weights(sd::Tensor<float> hidden_sta
|
|||||||
return hidden_states;
|
return hidden_states;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool all_one = true;
|
||||||
|
for (float weight : weights) {
|
||||||
|
if (weight != 1.0f) {
|
||||||
|
all_one = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (all_one) {
|
||||||
|
return hidden_states;
|
||||||
|
}
|
||||||
|
|
||||||
if (hidden_states.dim() == 1) {
|
if (hidden_states.dim() == 1) {
|
||||||
hidden_states.unsqueeze_(1);
|
hidden_states.unsqueeze_(1);
|
||||||
}
|
}
|
||||||
@ -57,7 +70,7 @@ static inline sd::Tensor<float> apply_token_weights(sd::Tensor<float> hidden_sta
|
|||||||
chunk_weights.reshape_({1, static_cast<int64_t>(weights.size())});
|
chunk_weights.reshape_({1, static_cast<int64_t>(weights.size())});
|
||||||
hidden_states *= chunk_weights;
|
hidden_states *= chunk_weights;
|
||||||
float new_mean = hidden_states.mean();
|
float new_mean = hidden_states.mean();
|
||||||
if (new_mean != 0.0f) {
|
if (std::isfinite(original_mean) && std::isfinite(new_mean) && new_mean != 0.0f) {
|
||||||
hidden_states *= (original_mean / new_mean);
|
hidden_states *= (original_mean / new_mean);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1958,4 +1971,277 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct LTXAVTextProjection : public GGMLBlock {
|
||||||
|
static constexpr int64_t kHiddenSize = 3840;
|
||||||
|
static constexpr int64_t kNumStates = 49;
|
||||||
|
bool dual_projection = false;
|
||||||
|
|
||||||
|
LTXAVTextProjection(bool dual_projection = false)
|
||||||
|
: dual_projection(dual_projection) {
|
||||||
|
if (dual_projection) {
|
||||||
|
blocks["video_aggregate_embed"] = std::make_shared<Linear>(kHiddenSize * kNumStates, 4096, true);
|
||||||
|
blocks["audio_aggregate_embed"] = std::make_shared<Linear>(kHiddenSize * kNumStates, 2048, true);
|
||||||
|
} else {
|
||||||
|
blocks["projection"] = std::make_shared<Linear>(kHiddenSize * kNumStates, kHiddenSize, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||||
|
if (!dual_projection) {
|
||||||
|
auto projection = std::dynamic_pointer_cast<Linear>(blocks["projection"]);
|
||||||
|
return projection->forward(ctx, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto video_projection = std::dynamic_pointer_cast<Linear>(blocks["video_aggregate_embed"]);
|
||||||
|
auto audio_projection = std::dynamic_pointer_cast<Linear>(blocks["audio_aggregate_embed"]);
|
||||||
|
auto video_in = ggml_ext_scale(ctx->ggml_ctx, x, std::sqrt(4096.f / static_cast<float>(kHiddenSize)));
|
||||||
|
auto audio_in = ggml_ext_scale(ctx->ggml_ctx, x, std::sqrt(2048.f / static_cast<float>(kHiddenSize)));
|
||||||
|
auto video = video_projection->forward(ctx, video_in);
|
||||||
|
auto audio = audio_projection->forward(ctx, audio_in);
|
||||||
|
return ggml_concat(ctx->ggml_ctx, video, audio, 0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LTXAVTextProjectionRunner : public GGMLRunner {
|
||||||
|
LTXAVTextProjection model;
|
||||||
|
|
||||||
|
LTXAVTextProjectionRunner(ggml_backend_t backend,
|
||||||
|
bool offload_params_to_cpu,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
const std::string& prefix = "")
|
||||||
|
: GGMLRunner(backend, offload_params_to_cpu),
|
||||||
|
model(tensor_storage_map.find(prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end()) {
|
||||||
|
model.init(params_ctx, tensor_storage_map, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() override {
|
||||||
|
return "ltxav_text_projection";
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) {
|
||||||
|
model.get_param_tensors(tensors, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
|
||||||
|
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
||||||
|
auto x = make_input(x_tensor);
|
||||||
|
auto runner_ctx = get_context();
|
||||||
|
auto out = model.forward(&runner_ctx, x);
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> compute(int n_threads, const sd::Tensor<float>& x) {
|
||||||
|
auto get_graph = [&]() -> ggml_cgraph* {
|
||||||
|
return build_graph(x);
|
||||||
|
};
|
||||||
|
return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LTXAVEmbedder : public Conditioner {
|
||||||
|
static constexpr int64_t kHiddenSize = 3840;
|
||||||
|
static constexpr int64_t kNumStates = 49;
|
||||||
|
static constexpr int64_t kMinLength = 1024;
|
||||||
|
|
||||||
|
std::shared_ptr<GemmaTokenizer> tokenizer;
|
||||||
|
std::shared_ptr<LLM::LLMRunner> llm;
|
||||||
|
std::shared_ptr<LTXAVTextProjectionRunner> projector;
|
||||||
|
bool dual_projection = false;
|
||||||
|
|
||||||
|
LTXAVEmbedder(ggml_backend_t backend,
|
||||||
|
bool offload_params_to_cpu,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
const std::string& llm_prefix = "text_encoders.llm",
|
||||||
|
const std::string& projector_prefix = "text_embedding_projection") {
|
||||||
|
tokenizer = std::make_shared<GemmaTokenizer>();
|
||||||
|
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::GEMMA3_12B,
|
||||||
|
backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
|
tensor_storage_map,
|
||||||
|
llm_prefix,
|
||||||
|
false);
|
||||||
|
dual_projection = tensor_storage_map.find(projector_prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end();
|
||||||
|
projector = std::make_shared<LTXAVTextProjectionRunner>(backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
|
tensor_storage_map,
|
||||||
|
projector_prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||||
|
llm->get_param_tensors(tensors, "text_encoders.llm");
|
||||||
|
projector->get_param_tensors(tensors, "text_embedding_projection");
|
||||||
|
}
|
||||||
|
|
||||||
|
void alloc_params_buffer() override {
|
||||||
|
llm->alloc_params_buffer();
|
||||||
|
projector->alloc_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_params_buffer() override {
|
||||||
|
llm->free_params_buffer();
|
||||||
|
projector->free_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_buffer_size() override {
|
||||||
|
return llm->get_params_buffer_size() + projector->get_params_buffer_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_flash_attention_enabled(bool enabled) override {
|
||||||
|
llm->set_flash_attention_enabled(enabled);
|
||||||
|
projector->set_flash_attention_enabled(enabled);
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||||
|
llm->set_weight_adapter(adapter);
|
||||||
|
projector->set_weight_adapter(adapter);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
|
||||||
|
const std::pair<int, int>& attn_range) {
|
||||||
|
std::vector<std::pair<std::string, float>> parsed_attention;
|
||||||
|
if (attn_range.first >= 0 && attn_range.second > 0) {
|
||||||
|
if (attn_range.first > 0) {
|
||||||
|
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
|
||||||
|
}
|
||||||
|
if (attn_range.second - attn_range.first > 0) {
|
||||||
|
auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
|
||||||
|
parsed_attention.insert(parsed_attention.end(), new_parsed_attention.begin(), new_parsed_attention.end());
|
||||||
|
}
|
||||||
|
if (static_cast<size_t>(attn_range.second) < text.size()) {
|
||||||
|
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
parsed_attention.emplace_back(text, 1.f);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> tokens;
|
||||||
|
std::vector<float> weights;
|
||||||
|
for (const auto& item : parsed_attention) {
|
||||||
|
auto curr_tokens = tokenizer->encode(item.first, nullptr);
|
||||||
|
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
|
||||||
|
weights.insert(weights.end(), curr_tokens.size(), item.second);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> mask;
|
||||||
|
tokenizer->pad_tokens(tokens, &weights, &mask, kMinLength);
|
||||||
|
return {tokens, weights, mask};
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> encode_prompt(int n_threads,
|
||||||
|
const std::string& prompt,
|
||||||
|
const std::pair<int, int>& prompt_attn_range) {
|
||||||
|
auto tokens_weights_mask = tokenize(prompt, prompt_attn_range);
|
||||||
|
auto& tokens = std::get<0>(tokens_weights_mask);
|
||||||
|
auto& weights = std::get<1>(tokens_weights_mask);
|
||||||
|
auto& mask = std::get<2>(tokens_weights_mask);
|
||||||
|
|
||||||
|
sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, std::vector<int32_t>(tokens.begin(), tokens.end()));
|
||||||
|
sd::Tensor<float> attention_mask;
|
||||||
|
if (!mask.empty()) {
|
||||||
|
const float mask_min = std::numeric_limits<float>::lowest() / 4.0f;
|
||||||
|
attention_mask = sd::Tensor<float>({static_cast<int64_t>(mask.size()), static_cast<int64_t>(mask.size())});
|
||||||
|
for (size_t i1 = 0; i1 < mask.size(); ++i1) {
|
||||||
|
for (size_t i0 = 0; i0 < mask.size(); ++i0) {
|
||||||
|
float value = 0.0f;
|
||||||
|
if (mask[i0] == 0.0f) {
|
||||||
|
value += mask_min;
|
||||||
|
}
|
||||||
|
if (i0 > i1) {
|
||||||
|
value += mask_min;
|
||||||
|
}
|
||||||
|
attention_mask[static_cast<int64_t>(i0 + mask.size() * i1)] = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto hidden_states = llm->compute(n_threads,
|
||||||
|
input_ids,
|
||||||
|
attention_mask,
|
||||||
|
{},
|
||||||
|
{},
|
||||||
|
true);
|
||||||
|
GGML_ASSERT(!hidden_states.empty());
|
||||||
|
hidden_states = apply_token_weights(std::move(hidden_states), weights);
|
||||||
|
|
||||||
|
int64_t valid_tokens = 0;
|
||||||
|
for (float value : mask) {
|
||||||
|
valid_tokens += static_cast<int64_t>(value > 0.0f);
|
||||||
|
}
|
||||||
|
GGML_ASSERT(valid_tokens > 0);
|
||||||
|
|
||||||
|
hidden_states = sd::ops::slice(hidden_states,
|
||||||
|
1,
|
||||||
|
hidden_states.shape()[1] - valid_tokens,
|
||||||
|
hidden_states.shape()[1]);
|
||||||
|
hidden_states.reshape_({kHiddenSize, kNumStates, valid_tokens});
|
||||||
|
hidden_states = hidden_states.permute({1, 0, 2});
|
||||||
|
|
||||||
|
if (dual_projection) {
|
||||||
|
for (int64_t state_idx = 0; state_idx < kNumStates; ++state_idx) {
|
||||||
|
for (int64_t token_idx = 0; token_idx < valid_tokens; ++token_idx) {
|
||||||
|
double sq_sum = 0.0;
|
||||||
|
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
|
||||||
|
float value = hidden_states.index(state_idx, hidden_idx, token_idx);
|
||||||
|
sq_sum += static_cast<double>(value) * static_cast<double>(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
float inv_rms = 1.0f / std::sqrt(static_cast<float>(sq_sum / static_cast<double>(kHiddenSize)) + 1e-6f);
|
||||||
|
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
|
||||||
|
hidden_states.index(state_idx, hidden_idx, token_idx) *= inv_rms;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int64_t state_idx = 0; state_idx < kNumStates; ++state_idx) {
|
||||||
|
double sum = 0.0;
|
||||||
|
float min_value = std::numeric_limits<float>::infinity();
|
||||||
|
float max_value = -std::numeric_limits<float>::infinity();
|
||||||
|
for (int64_t token_idx = 0; token_idx < valid_tokens; ++token_idx) {
|
||||||
|
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
|
||||||
|
float value = hidden_states.index(state_idx, hidden_idx, token_idx);
|
||||||
|
sum += value;
|
||||||
|
min_value = std::min(min_value, value);
|
||||||
|
max_value = std::max(max_value, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
float mean_value = static_cast<float>(sum / static_cast<double>(kHiddenSize * valid_tokens));
|
||||||
|
float denom = max_value - min_value + 1e-6f;
|
||||||
|
float scale_value = 8.0f / denom;
|
||||||
|
for (int64_t token_idx = 0; token_idx < valid_tokens; ++token_idx) {
|
||||||
|
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
|
||||||
|
float value = hidden_states.index(state_idx, hidden_idx, token_idx);
|
||||||
|
hidden_states.index(state_idx, hidden_idx, token_idx) = (value - mean_value) * scale_value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
hidden_states.reshape_({kNumStates * kHiddenSize, valid_tokens});
|
||||||
|
return projector->compute(n_threads, hidden_states);
|
||||||
|
}
|
||||||
|
|
||||||
|
SDCondition get_learned_condition(int n_threads,
|
||||||
|
const ConditionerParams& conditioner_params) override {
|
||||||
|
int64_t t0 = ggml_time_ms();
|
||||||
|
|
||||||
|
std::string prompt;
|
||||||
|
std::pair<int, int> prompt_attn_range;
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
|
||||||
|
auto hidden_states = encode_prompt(n_threads, prompt, prompt_attn_range);
|
||||||
|
GGML_ASSERT(!hidden_states.empty());
|
||||||
|
|
||||||
|
int64_t t1 = ggml_time_ms();
|
||||||
|
LOG_DEBUG("computing LTXAV condition graph completed, taking %" PRId64 " ms", t1 - t0);
|
||||||
|
|
||||||
|
SDCondition result;
|
||||||
|
result.c_crossattn = std::move(hidden_states);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -5,6 +5,7 @@
|
|||||||
#include "anima.hpp"
|
#include "anima.hpp"
|
||||||
#include "ernie_image.hpp"
|
#include "ernie_image.hpp"
|
||||||
#include "flux.hpp"
|
#include "flux.hpp"
|
||||||
|
#include "ltxv.hpp"
|
||||||
#include "mmdit.hpp"
|
#include "mmdit.hpp"
|
||||||
#include "qwen_image.hpp"
|
#include "qwen_image.hpp"
|
||||||
#include "tensor_ggml.hpp"
|
#include "tensor_ggml.hpp"
|
||||||
@ -14,7 +15,9 @@
|
|||||||
|
|
||||||
struct DiffusionParams {
|
struct DiffusionParams {
|
||||||
const sd::Tensor<float>* x = nullptr;
|
const sd::Tensor<float>* x = nullptr;
|
||||||
|
const sd::Tensor<float>* audio_x = nullptr;
|
||||||
const sd::Tensor<float>* timesteps = nullptr;
|
const sd::Tensor<float>* timesteps = nullptr;
|
||||||
|
const sd::Tensor<float>* audio_timesteps = nullptr;
|
||||||
const sd::Tensor<float>* context = nullptr;
|
const sd::Tensor<float>* context = nullptr;
|
||||||
const sd::Tensor<float>* c_concat = nullptr;
|
const sd::Tensor<float>* c_concat = nullptr;
|
||||||
const sd::Tensor<float>* y = nullptr;
|
const sd::Tensor<float>* y = nullptr;
|
||||||
@ -28,6 +31,7 @@ struct DiffusionParams {
|
|||||||
float control_strength = 0.f;
|
float control_strength = 0.f;
|
||||||
const sd::Tensor<float>* vace_context = nullptr;
|
const sd::Tensor<float>* vace_context = nullptr;
|
||||||
float vace_strength = 1.f;
|
float vace_strength = 1.f;
|
||||||
|
int audio_length = 0;
|
||||||
const std::vector<int>* skip_layers = nullptr;
|
const std::vector<int>* skip_layers = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -579,4 +583,69 @@ struct ErnieImageModel : public DiffusionModel {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct LTXAVModel : public DiffusionModel {
|
||||||
|
std::string prefix;
|
||||||
|
LTXV::LTXAVRunner ltxav;
|
||||||
|
|
||||||
|
LTXAVModel(ggml_backend_t backend,
|
||||||
|
bool offload_params_to_cpu,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
const std::string prefix = "model.diffusion_model")
|
||||||
|
: prefix(prefix), ltxav(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() override {
|
||||||
|
return ltxav.get_desc();
|
||||||
|
}
|
||||||
|
|
||||||
|
void alloc_params_buffer() override {
|
||||||
|
ltxav.alloc_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_params_buffer() override {
|
||||||
|
ltxav.free_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_compute_buffer() override {
|
||||||
|
ltxav.free_compute_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||||
|
ltxav.get_param_tensors(tensors, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_buffer_size() override {
|
||||||
|
return ltxav.get_params_buffer_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||||
|
ltxav.set_weight_adapter(adapter);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_adm_in_channels() override {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_flash_attention_enabled(bool enabled) override {
|
||||||
|
ltxav.set_flash_attention_enabled(enabled);
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||||
|
ltxav.set_circular_axes(circular_x, circular_y);
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> compute(int n_threads,
|
||||||
|
const DiffusionParams& diffusion_params) override {
|
||||||
|
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||||
|
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||||
|
return ltxav.compute(n_threads,
|
||||||
|
*diffusion_params.x,
|
||||||
|
*diffusion_params.timesteps,
|
||||||
|
tensor_or_empty(diffusion_params.context),
|
||||||
|
tensor_or_empty(diffusion_params.audio_x),
|
||||||
|
tensor_or_empty(diffusion_params.audio_timesteps),
|
||||||
|
diffusion_params.audio_length);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -1675,13 +1675,22 @@ struct WeightAdapter {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct GGMLRunnerContext {
|
struct GGMLRunnerContext {
|
||||||
ggml_backend_t backend = nullptr;
|
ggml_backend_t backend = nullptr;
|
||||||
ggml_context* ggml_ctx = nullptr;
|
ggml_context* ggml_ctx = nullptr;
|
||||||
bool flash_attn_enabled = false;
|
bool flash_attn_enabled = false;
|
||||||
bool conv2d_direct_enabled = false;
|
bool conv2d_direct_enabled = false;
|
||||||
bool circular_x_enabled = false;
|
bool circular_x_enabled = false;
|
||||||
bool circular_y_enabled = false;
|
bool circular_y_enabled = false;
|
||||||
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
||||||
|
std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
|
||||||
|
|
||||||
|
void capture_tensor(const std::string& name, ggml_tensor* tensor) {
|
||||||
|
if (debug_tensors == nullptr || tensor == nullptr) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
ggml_set_output(tensor);
|
||||||
|
(*debug_tensors)[tensor] = name;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct GGMLRunner {
|
struct GGMLRunner {
|
||||||
@ -1713,6 +1722,7 @@ protected:
|
|||||||
|
|
||||||
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
|
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
|
||||||
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
|
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
|
||||||
|
std::unordered_map<ggml_tensor*, std::string> debug_tensors;
|
||||||
const std::string final_result_name = "ggml_runner_final_result_tensor";
|
const std::string final_result_name = "ggml_runner_final_result_tensor";
|
||||||
|
|
||||||
bool flash_attn_enabled = false;
|
bool flash_attn_enabled = false;
|
||||||
@ -1799,6 +1809,7 @@ protected:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void free_compute_ctx() {
|
void free_compute_ctx() {
|
||||||
|
debug_tensors.clear();
|
||||||
if (compute_ctx != nullptr) {
|
if (compute_ctx != nullptr) {
|
||||||
ggml_free(compute_ctx);
|
ggml_free(compute_ctx);
|
||||||
compute_ctx = nullptr;
|
compute_ctx = nullptr;
|
||||||
@ -1834,6 +1845,11 @@ protected:
|
|||||||
auto result = ggml_graph_node(gf, -1);
|
auto result = ggml_graph_node(gf, -1);
|
||||||
ggml_set_name(result, final_result_name.c_str());
|
ggml_set_name(result, final_result_name.c_str());
|
||||||
}
|
}
|
||||||
|
for (const auto& entry : debug_tensors) {
|
||||||
|
if (entry.first != nullptr) {
|
||||||
|
ggml_build_forward_expand(gf, entry.first);
|
||||||
|
}
|
||||||
|
}
|
||||||
prepare_build_in_tensor_after(gf);
|
prepare_build_in_tensor_after(gf);
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
@ -1903,6 +1919,21 @@ protected:
|
|||||||
for (auto& kv : backend_tensor_data_map) {
|
for (auto& kv : backend_tensor_data_map) {
|
||||||
auto tensor = kv.first;
|
auto tensor = kv.first;
|
||||||
auto data = kv.second;
|
auto data = kv.second;
|
||||||
|
if (tensor == nullptr || data == nullptr) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const char* name = ggml_get_name(tensor);
|
||||||
|
if (tensor->buffer == nullptr) {
|
||||||
|
LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
|
||||||
|
get_desc().c_str(),
|
||||||
|
name != nullptr ? name : "",
|
||||||
|
(long long)tensor->ne[0],
|
||||||
|
(long long)tensor->ne[1],
|
||||||
|
(long long)tensor->ne[2],
|
||||||
|
(long long)tensor->ne[3],
|
||||||
|
ggml_type_name(tensor->type));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
|
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
|
||||||
}
|
}
|
||||||
@ -2025,6 +2056,7 @@ public:
|
|||||||
runner_ctx.circular_x_enabled = circular_x_enabled;
|
runner_ctx.circular_x_enabled = circular_x_enabled;
|
||||||
runner_ctx.circular_y_enabled = circular_y_enabled;
|
runner_ctx.circular_y_enabled = circular_y_enabled;
|
||||||
runner_ctx.weight_adapter = weight_adapter;
|
runner_ctx.weight_adapter = weight_adapter;
|
||||||
|
runner_ctx.debug_tensors = &debug_tensors;
|
||||||
return runner_ctx;
|
return runner_ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2163,6 +2195,21 @@ public:
|
|||||||
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
|
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
|
||||||
return std::nullopt;
|
return std::nullopt;
|
||||||
}
|
}
|
||||||
|
for (const auto& entry : debug_tensors) {
|
||||||
|
auto tensor = entry.first;
|
||||||
|
if (tensor == nullptr) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (tensor->type != GGML_TYPE_F32) {
|
||||||
|
LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
|
||||||
|
get_desc().c_str(),
|
||||||
|
entry.second.c_str(),
|
||||||
|
ggml_type_name(tensor->type));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
|
||||||
|
print_sd_tensor(debug_tensor, false, entry.second.c_str());
|
||||||
|
}
|
||||||
copy_cache_tensors_to_cache_buffer();
|
copy_cache_tensors_to_cache_buffer();
|
||||||
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
|
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
|
||||||
std::optional<sd::Tensor<T>> output;
|
std::optional<sd::Tensor<T>> output;
|
||||||
|
|||||||
356
src/llm.hpp
356
src/llm.hpp
@ -2,8 +2,10 @@
|
|||||||
#define __LLM_HPP__
|
#define __LLM_HPP__
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <limits>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <optional>
|
#include <optional>
|
||||||
@ -30,6 +32,7 @@ namespace LLM {
|
|||||||
QWEN3,
|
QWEN3,
|
||||||
MISTRAL_SMALL_3_2,
|
MISTRAL_SMALL_3_2,
|
||||||
MINISTRAL_3_3B,
|
MINISTRAL_3_3B,
|
||||||
|
GEMMA3_12B,
|
||||||
ARCH_COUNT,
|
ARCH_COUNT,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -38,6 +41,12 @@ namespace LLM {
|
|||||||
"qwen3",
|
"qwen3",
|
||||||
"mistral_small3.2",
|
"mistral_small3.2",
|
||||||
"ministral3.3b",
|
"ministral3.3b",
|
||||||
|
"gemma3_12b",
|
||||||
|
};
|
||||||
|
|
||||||
|
enum class MLPActivation {
|
||||||
|
SILU,
|
||||||
|
GELU_TANH,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLMVisionParams {
|
struct LLMVisionParams {
|
||||||
@ -55,23 +64,71 @@ namespace LLM {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct LLMParams {
|
struct LLMParams {
|
||||||
LLMArch arch = LLMArch::QWEN2_5_VL;
|
LLMArch arch = LLMArch::QWEN2_5_VL;
|
||||||
int64_t num_layers = 28;
|
int64_t num_layers = 28;
|
||||||
int64_t hidden_size = 3584;
|
int64_t hidden_size = 3584;
|
||||||
int64_t intermediate_size = 18944;
|
int64_t intermediate_size = 18944;
|
||||||
int num_heads = 28;
|
int num_heads = 28;
|
||||||
int num_kv_heads = 4;
|
int num_kv_heads = 4;
|
||||||
int head_dim = 128;
|
int head_dim = 128;
|
||||||
bool qkv_bias = true;
|
bool qkv_bias = true;
|
||||||
bool qk_norm = false;
|
bool qk_norm = false;
|
||||||
int64_t vocab_size = 152064;
|
bool rms_norm_add = false;
|
||||||
float rms_norm_eps = 1e-06f;
|
bool normalize_input = false;
|
||||||
|
int64_t vocab_size = 152064;
|
||||||
|
int64_t max_position_embeddings = 128000;
|
||||||
|
float rms_norm_eps = 1e-06f;
|
||||||
|
MLPActivation mlp_activation = MLPActivation::SILU;
|
||||||
|
std::vector<float> rope_thetas = {1000000.f};
|
||||||
|
std::vector<float> rope_scales = {1.f};
|
||||||
|
std::vector<int> sliding_attention;
|
||||||
LLMVisionParams vision;
|
LLMVisionParams vision;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct MLP : public GGMLBlock {
|
struct LLMRMSNorm : public UnaryBlock {
|
||||||
|
protected:
|
||||||
|
int64_t hidden_size;
|
||||||
|
float eps;
|
||||||
|
bool add_unit_offset;
|
||||||
|
std::string prefix;
|
||||||
|
|
||||||
|
void init_params(ggml_context* ctx,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
std::string prefix = "") override {
|
||||||
|
this->prefix = prefix;
|
||||||
|
params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
MLP(int64_t hidden_size, int64_t intermediate_size, bool bias = false) {
|
LLMRMSNorm(int64_t hidden_size,
|
||||||
|
float eps = 1e-06f,
|
||||||
|
bool add_unit_offset = false)
|
||||||
|
: hidden_size(hidden_size), eps(eps), add_unit_offset(add_unit_offset) {}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||||
|
ggml_tensor* w = params["weight"];
|
||||||
|
if (ctx->weight_adapter) {
|
||||||
|
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
|
||||||
|
}
|
||||||
|
x = ggml_rms_norm(ctx->ggml_ctx, x, eps);
|
||||||
|
auto scaled = ggml_mul(ctx->ggml_ctx, x, w);
|
||||||
|
if (add_unit_offset) {
|
||||||
|
scaled = ggml_add_inplace(ctx->ggml_ctx, scaled, x);
|
||||||
|
}
|
||||||
|
return scaled;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MLP : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
MLPActivation activation;
|
||||||
|
|
||||||
|
public:
|
||||||
|
MLP(int64_t hidden_size,
|
||||||
|
int64_t intermediate_size,
|
||||||
|
bool bias = false,
|
||||||
|
MLPActivation activation_ = MLPActivation::SILU)
|
||||||
|
: activation(activation_) {
|
||||||
blocks["gate_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, bias));
|
blocks["gate_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, bias));
|
||||||
blocks["up_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, bias));
|
blocks["up_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, bias));
|
||||||
blocks["down_proj"] = std::shared_ptr<GGMLBlock>(new Linear(intermediate_size, hidden_size, bias));
|
blocks["down_proj"] = std::shared_ptr<GGMLBlock>(new Linear(intermediate_size, hidden_size, bias));
|
||||||
@ -84,9 +141,13 @@ namespace LLM {
|
|||||||
auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);
|
auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);
|
||||||
|
|
||||||
auto h = gate_proj->forward(ctx, x);
|
auto h = gate_proj->forward(ctx, x);
|
||||||
h = ggml_silu_inplace(ctx->ggml_ctx, h);
|
if (activation == MLPActivation::GELU_TANH) {
|
||||||
h = ggml_mul_inplace(ctx->ggml_ctx, h, up_proj->forward(ctx, x));
|
h = ggml_ext_gelu(ctx->ggml_ctx, h, true);
|
||||||
h = down_proj->forward(ctx, h);
|
} else {
|
||||||
|
h = ggml_silu_inplace(ctx->ggml_ctx, h);
|
||||||
|
}
|
||||||
|
h = ggml_mul_inplace(ctx->ggml_ctx, h, up_proj->forward(ctx, x));
|
||||||
|
h = down_proj->forward(ctx, h);
|
||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -377,24 +438,35 @@ namespace LLM {
|
|||||||
int64_t num_heads;
|
int64_t num_heads;
|
||||||
int64_t num_kv_heads;
|
int64_t num_kv_heads;
|
||||||
bool qk_norm;
|
bool qk_norm;
|
||||||
|
int64_t max_position_embeddings;
|
||||||
|
std::vector<float> rope_thetas;
|
||||||
|
std::vector<float> rope_scales;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Attention(const LLMParams& params)
|
Attention(const LLMParams& params)
|
||||||
: arch(params.arch), num_heads(params.num_heads), num_kv_heads(params.num_kv_heads), head_dim(params.head_dim), qk_norm(params.qk_norm) {
|
: arch(params.arch),
|
||||||
|
num_heads(params.num_heads),
|
||||||
|
num_kv_heads(params.num_kv_heads),
|
||||||
|
head_dim(params.head_dim),
|
||||||
|
qk_norm(params.qk_norm),
|
||||||
|
max_position_embeddings(params.max_position_embeddings),
|
||||||
|
rope_thetas(params.rope_thetas),
|
||||||
|
rope_scales(params.rope_scales) {
|
||||||
blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
|
blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
|
||||||
blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
|
blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
|
||||||
blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
|
blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
|
||||||
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, false);
|
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, false);
|
||||||
if (params.qk_norm) {
|
if (params.qk_norm) {
|
||||||
blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim, params.rms_norm_eps);
|
blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
|
||||||
blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim, params.rms_norm_eps);
|
blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
ggml_tensor* x,
|
ggml_tensor* x,
|
||||||
ggml_tensor* input_pos,
|
ggml_tensor* input_pos,
|
||||||
ggml_tensor* attention_mask = nullptr) {
|
ggml_tensor* attention_mask = nullptr,
|
||||||
|
int rope_index = 0) {
|
||||||
// x: [N, n_token, hidden_size]
|
// x: [N, n_token, hidden_size]
|
||||||
int64_t n_token = x->ne[1];
|
int64_t n_token = x->ne[1];
|
||||||
int64_t N = x->ne[2];
|
int64_t N = x->ne[2];
|
||||||
@ -412,8 +484,8 @@ namespace LLM {
|
|||||||
v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_kv_heads, n_token, N); // [N, n_token, num_kv_heads, head_dim]
|
v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_kv_heads, n_token, N); // [N, n_token, num_kv_heads, head_dim]
|
||||||
|
|
||||||
if (qk_norm) {
|
if (qk_norm) {
|
||||||
auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
|
auto q_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["q_norm"]);
|
||||||
auto k_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
|
auto k_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["k_norm"]);
|
||||||
|
|
||||||
q = q_norm->forward(ctx, q);
|
q = q_norm->forward(ctx, q);
|
||||||
k = k_norm->forward(ctx, k);
|
k = k_norm->forward(ctx, k);
|
||||||
@ -428,6 +500,36 @@ namespace LLM {
|
|||||||
} else if (arch == LLMArch::QWEN3) {
|
} else if (arch == LLMArch::QWEN3) {
|
||||||
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
||||||
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
||||||
|
} else if (arch == LLMArch::GEMMA3_12B) {
|
||||||
|
float rope_theta = (rope_index == 1 ? 10000.0f : 1000000.0f);
|
||||||
|
float rope_scale = (rope_index == 1 ? 1.f : 8.f);
|
||||||
|
float freq_scale = 1.f / rope_scale;
|
||||||
|
q = ggml_rope_ext(ctx->ggml_ctx,
|
||||||
|
q,
|
||||||
|
input_pos,
|
||||||
|
nullptr,
|
||||||
|
head_dim,
|
||||||
|
GGML_ROPE_TYPE_NORMAL,
|
||||||
|
0,
|
||||||
|
rope_theta,
|
||||||
|
freq_scale,
|
||||||
|
0.f,
|
||||||
|
1.f,
|
||||||
|
32.f,
|
||||||
|
1.f);
|
||||||
|
k = ggml_rope_ext(ctx->ggml_ctx,
|
||||||
|
k,
|
||||||
|
input_pos,
|
||||||
|
nullptr,
|
||||||
|
head_dim,
|
||||||
|
GGML_ROPE_TYPE_NORMAL,
|
||||||
|
0,
|
||||||
|
rope_theta,
|
||||||
|
freq_scale,
|
||||||
|
0.f,
|
||||||
|
1.f,
|
||||||
|
32.f,
|
||||||
|
1.f);
|
||||||
} else {
|
} else {
|
||||||
int sections[4] = {16, 24, 24, 0};
|
int sections[4] = {16, 24, 24, 0};
|
||||||
q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
||||||
@ -448,33 +550,76 @@ namespace LLM {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct TransformerBlock : public GGMLBlock {
|
struct TransformerBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
LLMArch arch;
|
||||||
|
int sliding_attention;
|
||||||
|
bool has_post_attention_norm;
|
||||||
|
bool has_post_ffw_norm;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
TransformerBlock(const LLMParams& params) {
|
TransformerBlock(const LLMParams& params, int layer_index)
|
||||||
|
: arch(params.arch),
|
||||||
|
sliding_attention(0),
|
||||||
|
has_post_attention_norm(params.arch == LLMArch::GEMMA3_12B),
|
||||||
|
has_post_ffw_norm(params.arch == LLMArch::GEMMA3_12B) {
|
||||||
blocks["self_attn"] = std::make_shared<Attention>(params);
|
blocks["self_attn"] = std::make_shared<Attention>(params);
|
||||||
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size, params.intermediate_size);
|
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size,
|
||||||
blocks["input_layernorm"] = std::make_shared<RMSNorm>(params.hidden_size, params.rms_norm_eps);
|
params.intermediate_size,
|
||||||
blocks["post_attention_layernorm"] = std::make_shared<RMSNorm>(params.hidden_size, params.rms_norm_eps);
|
false,
|
||||||
|
params.mlp_activation);
|
||||||
|
blocks["input_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||||
|
blocks["post_attention_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||||
|
if (has_post_attention_norm) {
|
||||||
|
blocks["post_attention_norm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||||
|
}
|
||||||
|
if (has_post_ffw_norm) {
|
||||||
|
blocks["post_ffw_norm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||||
|
}
|
||||||
|
if (!params.sliding_attention.empty()) {
|
||||||
|
sliding_attention = params.sliding_attention[layer_index % params.sliding_attention.size()];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
ggml_tensor* x,
|
ggml_tensor* x,
|
||||||
ggml_tensor* input_pos,
|
ggml_tensor* input_pos,
|
||||||
ggml_tensor* attention_mask = nullptr) {
|
ggml_tensor* attention_mask = nullptr,
|
||||||
|
ggml_tensor* sliding_attention_mask = nullptr) {
|
||||||
// x: [N, n_token, hidden_size]
|
// x: [N, n_token, hidden_size]
|
||||||
auto self_attn = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
|
auto self_attn = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
|
||||||
auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
|
auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
|
||||||
auto input_layernorm = std::dynamic_pointer_cast<RMSNorm>(blocks["input_layernorm"]);
|
auto input_layernorm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["input_layernorm"]);
|
||||||
auto post_attention_layernorm = std::dynamic_pointer_cast<RMSNorm>(blocks["post_attention_layernorm"]);
|
auto post_attention_layernorm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_attention_layernorm"]);
|
||||||
|
std::shared_ptr<LLMRMSNorm> post_attention_norm = nullptr;
|
||||||
|
std::shared_ptr<LLMRMSNorm> post_ffw_norm = nullptr;
|
||||||
|
if (has_post_attention_norm) {
|
||||||
|
post_attention_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_attention_norm"]);
|
||||||
|
}
|
||||||
|
if (has_post_ffw_norm) {
|
||||||
|
post_ffw_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_ffw_norm"]);
|
||||||
|
}
|
||||||
|
ggml_tensor* block_attention_mask = attention_mask;
|
||||||
|
int rope_index = 0;
|
||||||
|
if (arch == LLMArch::GEMMA3_12B && sliding_attention > 0) {
|
||||||
|
block_attention_mask = sliding_attention_mask;
|
||||||
|
rope_index = 1;
|
||||||
|
}
|
||||||
|
|
||||||
auto residual = x;
|
auto residual = x;
|
||||||
x = input_layernorm->forward(ctx, x);
|
x = input_layernorm->forward(ctx, x);
|
||||||
x = self_attn->forward(ctx, x, input_pos, attention_mask);
|
x = self_attn->forward(ctx, x, input_pos, block_attention_mask, rope_index);
|
||||||
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
|
if (post_attention_norm != nullptr) {
|
||||||
|
x = post_attention_norm->forward(ctx, x);
|
||||||
|
}
|
||||||
|
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
|
||||||
|
|
||||||
residual = x;
|
residual = x;
|
||||||
x = post_attention_layernorm->forward(ctx, x);
|
x = post_attention_layernorm->forward(ctx, x);
|
||||||
x = mlp->forward(ctx, x);
|
x = mlp->forward(ctx, x);
|
||||||
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
|
if (post_ffw_norm != nullptr) {
|
||||||
|
x = post_ffw_norm->forward(ctx, x);
|
||||||
|
}
|
||||||
|
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
|
||||||
|
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
@ -483,28 +628,36 @@ namespace LLM {
|
|||||||
struct TextModel : public GGMLBlock {
|
struct TextModel : public GGMLBlock {
|
||||||
protected:
|
protected:
|
||||||
int64_t num_layers;
|
int64_t num_layers;
|
||||||
|
int64_t hidden_size;
|
||||||
|
bool normalize_input;
|
||||||
|
float input_scale;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
TextModel(const LLMParams& params)
|
TextModel(const LLMParams& params)
|
||||||
: num_layers(params.num_layers) {
|
: num_layers(params.num_layers),
|
||||||
|
hidden_size(params.hidden_size),
|
||||||
|
normalize_input(params.normalize_input),
|
||||||
|
input_scale(std::sqrt(static_cast<float>(params.hidden_size))) {
|
||||||
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size, params.hidden_size));
|
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size, params.hidden_size));
|
||||||
for (int i = 0; i < num_layers; i++) {
|
for (int i = 0; i < num_layers; i++) {
|
||||||
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params));
|
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params, i));
|
||||||
}
|
}
|
||||||
blocks["norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.hidden_size, params.rms_norm_eps));
|
blocks["norm"] = std::shared_ptr<GGMLBlock>(new LLMRMSNorm(params.hidden_size, params.rms_norm_eps, params.rms_norm_add));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
ggml_tensor* input_ids,
|
ggml_tensor* input_ids,
|
||||||
ggml_tensor* input_pos,
|
ggml_tensor* input_pos,
|
||||||
ggml_tensor* attention_mask,
|
ggml_tensor* attention_mask,
|
||||||
|
ggml_tensor* sliding_attention_mask,
|
||||||
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
|
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
|
||||||
std::set<int> out_layers) {
|
std::set<int> out_layers,
|
||||||
|
bool return_all_hidden_states = false) {
|
||||||
// input_ids: [N, n_token]
|
// input_ids: [N, n_token]
|
||||||
// return: [N, n_token, hidden_size]
|
// return: [N, n_token, hidden_size]
|
||||||
|
|
||||||
auto embed_tokens = std::dynamic_pointer_cast<Embedding>(blocks["embed_tokens"]);
|
auto embed_tokens = std::dynamic_pointer_cast<Embedding>(blocks["embed_tokens"]);
|
||||||
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
|
auto norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["norm"]);
|
||||||
|
|
||||||
auto x = embed_tokens->forward(ctx, input_ids);
|
auto x = embed_tokens->forward(ctx, input_ids);
|
||||||
|
|
||||||
@ -549,22 +702,44 @@ namespace LLM {
|
|||||||
x = input_embed;
|
x = input_embed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (normalize_input) {
|
||||||
|
x = ggml_ext_scale(ctx->ggml_ctx, x, input_scale, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (return_all_hidden_states) {
|
||||||
|
intermediate_outputs.push_back(x);
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < num_layers; i++) {
|
for (int i = 0; i < num_layers; i++) {
|
||||||
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);
|
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);
|
||||||
|
|
||||||
x = block->forward(ctx, x, input_pos, attention_mask);
|
x = block->forward(ctx, x, input_pos, attention_mask, sliding_attention_mask);
|
||||||
if (out_layers.find(i + 1) != out_layers.end()) {
|
if (return_all_hidden_states) {
|
||||||
|
if (i + 1 < num_layers) {
|
||||||
|
intermediate_outputs.push_back(x);
|
||||||
|
}
|
||||||
|
} else if (out_layers.find(i + 1) != out_layers.end()) {
|
||||||
intermediate_outputs.push_back(x);
|
intermediate_outputs.push_back(x);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!intermediate_outputs.empty()) {
|
auto normed_x = norm->forward(ctx, x);
|
||||||
|
if (return_all_hidden_states) {
|
||||||
|
intermediate_outputs.push_back(normed_x);
|
||||||
|
x = intermediate_outputs[0];
|
||||||
|
for (int i = 1; i < intermediate_outputs.size(); i++) {
|
||||||
|
x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0);
|
||||||
|
}
|
||||||
|
} else if (!intermediate_outputs.empty()) {
|
||||||
|
if (out_layers.find(static_cast<int>(num_layers + 1)) != out_layers.end()) {
|
||||||
|
intermediate_outputs.push_back(normed_x);
|
||||||
|
}
|
||||||
x = intermediate_outputs[0];
|
x = intermediate_outputs[0];
|
||||||
for (int i = 1; i < intermediate_outputs.size(); i++) {
|
for (int i = 1; i < intermediate_outputs.size(); i++) {
|
||||||
x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0);
|
x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
x = norm->forward(ctx, x);
|
x = normed_x;
|
||||||
}
|
}
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
@ -599,12 +774,21 @@ namespace LLM {
|
|||||||
ggml_tensor* input_ids,
|
ggml_tensor* input_ids,
|
||||||
ggml_tensor* input_pos,
|
ggml_tensor* input_pos,
|
||||||
ggml_tensor* attention_mask,
|
ggml_tensor* attention_mask,
|
||||||
|
ggml_tensor* sliding_attention_mask,
|
||||||
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
|
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
|
||||||
std::set<int> out_layers) {
|
std::set<int> out_layers,
|
||||||
|
bool return_all_hidden_states = false) {
|
||||||
// input_ids: [N, n_token]
|
// input_ids: [N, n_token]
|
||||||
auto model = std::dynamic_pointer_cast<TextModel>(blocks["model"]);
|
auto model = std::dynamic_pointer_cast<TextModel>(blocks["model"]);
|
||||||
|
|
||||||
auto x = model->forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
|
auto x = model->forward(ctx,
|
||||||
|
input_ids,
|
||||||
|
input_pos,
|
||||||
|
attention_mask,
|
||||||
|
sliding_attention_mask,
|
||||||
|
image_embeds,
|
||||||
|
out_layers,
|
||||||
|
return_all_hidden_states);
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -627,6 +811,7 @@ namespace LLM {
|
|||||||
|
|
||||||
std::vector<int> input_pos_vec;
|
std::vector<int> input_pos_vec;
|
||||||
std::vector<float> attention_mask_vec;
|
std::vector<float> attention_mask_vec;
|
||||||
|
std::vector<float> sliding_attention_mask_vec;
|
||||||
std::vector<float> window_mask_vec;
|
std::vector<float> window_mask_vec;
|
||||||
std::vector<int> window_index_vec;
|
std::vector<int> window_index_vec;
|
||||||
std::vector<int> window_inverse_index_vec;
|
std::vector<int> window_inverse_index_vec;
|
||||||
@ -653,6 +838,23 @@ namespace LLM {
|
|||||||
params.qkv_bias = false;
|
params.qkv_bias = false;
|
||||||
params.qk_norm = true;
|
params.qk_norm = true;
|
||||||
params.rms_norm_eps = 1e-6f;
|
params.rms_norm_eps = 1e-6f;
|
||||||
|
} else if (arch == LLMArch::GEMMA3_12B) {
|
||||||
|
params.head_dim = 256;
|
||||||
|
params.num_heads = 16;
|
||||||
|
params.num_kv_heads = 8;
|
||||||
|
params.qkv_bias = false;
|
||||||
|
params.qk_norm = true;
|
||||||
|
params.rms_norm_eps = 1e-6f;
|
||||||
|
// llama.cpp adds +1 to Gemma3 norm.weight when exporting GGUF, so GGUF loading
|
||||||
|
// must keep rms_norm_add disabled here or the offset gets applied twice.
|
||||||
|
// Convenient for the converter, less convenient for whoever gets to debug it later.
|
||||||
|
params.rms_norm_add = false;
|
||||||
|
params.normalize_input = true;
|
||||||
|
params.max_position_embeddings = 131072;
|
||||||
|
params.mlp_activation = MLPActivation::GELU_TANH;
|
||||||
|
params.rope_thetas = {1000000.f, 10000.f};
|
||||||
|
params.rope_scales = {8.f, 1.f};
|
||||||
|
params.sliding_attention = {1024, 1024, 1024, 1024, 1024, 0};
|
||||||
}
|
}
|
||||||
bool have_vision_weight = false;
|
bool have_vision_weight = false;
|
||||||
bool llama_cpp_style = false;
|
bool llama_cpp_style = false;
|
||||||
@ -722,9 +924,18 @@ namespace LLM {
|
|||||||
ggml_tensor* input_ids,
|
ggml_tensor* input_ids,
|
||||||
ggml_tensor* input_pos,
|
ggml_tensor* input_pos,
|
||||||
ggml_tensor* attention_mask,
|
ggml_tensor* attention_mask,
|
||||||
|
ggml_tensor* sliding_attention_mask,
|
||||||
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
|
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
|
||||||
std::set<int> out_layers) {
|
std::set<int> out_layers,
|
||||||
auto hidden_states = model.forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); // [N, n_token, hidden_size]
|
bool return_all_hidden_states = false) {
|
||||||
|
auto hidden_states = model.forward(ctx,
|
||||||
|
input_ids,
|
||||||
|
input_pos,
|
||||||
|
attention_mask,
|
||||||
|
sliding_attention_mask,
|
||||||
|
image_embeds,
|
||||||
|
out_layers,
|
||||||
|
return_all_hidden_states); // [N, n_token, hidden_size]
|
||||||
return hidden_states;
|
return hidden_states;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -741,8 +952,9 @@ namespace LLM {
|
|||||||
ggml_cgraph* build_graph(const sd::Tensor<int32_t>& input_ids_tensor,
|
ggml_cgraph* build_graph(const sd::Tensor<int32_t>& input_ids_tensor,
|
||||||
const sd::Tensor<float>& attention_mask_tensor,
|
const sd::Tensor<float>& attention_mask_tensor,
|
||||||
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds_tensor,
|
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds_tensor,
|
||||||
std::set<int> out_layers) {
|
std::set<int> out_layers,
|
||||||
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
bool return_all_hidden_states = false) {
|
||||||
|
ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE);
|
||||||
ggml_tensor* input_ids = make_input(input_ids_tensor);
|
ggml_tensor* input_ids = make_input(input_ids_tensor);
|
||||||
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
|
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
|
||||||
image_embeds.reserve(image_embeds_tensor.size());
|
image_embeds.reserve(image_embeds_tensor.size());
|
||||||
@ -752,7 +964,10 @@ namespace LLM {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int64_t n_tokens = input_ids->ne[0];
|
int64_t n_tokens = input_ids->ne[0];
|
||||||
if (params.arch == LLMArch::MISTRAL_SMALL_3_2 || params.arch == LLMArch::MINISTRAL_3_3B || params.arch == LLMArch::QWEN3) {
|
if (params.arch == LLMArch::MISTRAL_SMALL_3_2 ||
|
||||||
|
params.arch == LLMArch::MINISTRAL_3_3B ||
|
||||||
|
params.arch == LLMArch::QWEN3 ||
|
||||||
|
params.arch == LLMArch::GEMMA3_12B) {
|
||||||
input_pos_vec.resize(n_tokens);
|
input_pos_vec.resize(n_tokens);
|
||||||
for (int i = 0; i < n_tokens; ++i) {
|
for (int i = 0; i < n_tokens; ++i) {
|
||||||
input_pos_vec[i] = i;
|
input_pos_vec[i] = i;
|
||||||
@ -772,7 +987,8 @@ namespace LLM {
|
|||||||
input_pos_vec.size());
|
input_pos_vec.size());
|
||||||
set_backend_tensor_data(input_pos, input_pos_vec.data());
|
set_backend_tensor_data(input_pos, input_pos_vec.data());
|
||||||
|
|
||||||
ggml_tensor* attention_mask = nullptr;
|
ggml_tensor* attention_mask = nullptr;
|
||||||
|
ggml_tensor* sliding_attention_mask = nullptr;
|
||||||
if (!attention_mask_tensor.empty()) {
|
if (!attention_mask_tensor.empty()) {
|
||||||
attention_mask = make_input(attention_mask_tensor);
|
attention_mask = make_input(attention_mask_tensor);
|
||||||
} else {
|
} else {
|
||||||
@ -790,9 +1006,36 @@ namespace LLM {
|
|||||||
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
|
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Gemma3 layers with a non-zero sliding_attention entry use their own additive
// mask, so build a second mask that also hides keys outside the local window.
if (params.arch == LLMArch::GEMMA3_12B) {
    // Start from the regular mask: the caller-supplied one when present,
    // otherwise the values backing attention_mask.
    if (!attention_mask_tensor.empty()) {
        GGML_ASSERT(attention_mask_tensor.numel() == n_tokens * n_tokens);
        sliding_attention_mask_vec = attention_mask_tensor.values();
    } else {
        sliding_attention_mask_vec = attention_mask_vec;
    }

    // Mask every key that lies `sliding_window` or more positions behind the
    // query. Rows index the query, columns the key. Queries below the window
    // size can never be affected, so they are not visited at all.
    const int64_t sliding_window = 1024;
    for (int64_t query = sliding_window; query < n_tokens; query++) {
        for (int64_t key = 0; key + sliding_window <= query; key++) {
            sliding_attention_mask_vec[query * n_tokens + key] = -INFINITY;
        }
    }

    sliding_attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
    set_backend_tensor_data(sliding_attention_mask, sliding_attention_mask_vec.data());
}
|
||||||
|
|
||||||
auto runner_ctx = get_context();
|
auto runner_ctx = get_context();
|
||||||
|
|
||||||
ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
|
ggml_tensor* hidden_states = forward(&runner_ctx,
|
||||||
|
input_ids,
|
||||||
|
input_pos,
|
||||||
|
attention_mask,
|
||||||
|
sliding_attention_mask,
|
||||||
|
image_embeds,
|
||||||
|
out_layers,
|
||||||
|
return_all_hidden_states);
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, hidden_states);
|
ggml_build_forward_expand(gf, hidden_states);
|
||||||
|
|
||||||
@ -803,9 +1046,14 @@ namespace LLM {
|
|||||||
const sd::Tensor<int32_t>& input_ids,
|
const sd::Tensor<int32_t>& input_ids,
|
||||||
const sd::Tensor<float>& attention_mask,
|
const sd::Tensor<float>& attention_mask,
|
||||||
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
|
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
|
||||||
std::set<int> out_layers) {
|
std::set<int> out_layers,
|
||||||
|
bool return_all_hidden_states = false) {
|
||||||
auto get_graph = [&]() -> ggml_cgraph* {
|
auto get_graph = [&]() -> ggml_cgraph* {
|
||||||
return build_graph(input_ids, attention_mask, image_embeds, out_layers);
|
return build_graph(input_ids,
|
||||||
|
attention_mask,
|
||||||
|
image_embeds,
|
||||||
|
out_layers,
|
||||||
|
return_all_hidden_states);
|
||||||
};
|
};
|
||||||
return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
|
return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
|
||||||
}
|
}
|
||||||
|
|||||||
971
src/ltx_vae.hpp
Normal file
971
src/ltx_vae.hpp
Normal file
@ -0,0 +1,971 @@
|
|||||||
|
#ifndef __SD_LTX_VAE_HPP__
|
||||||
|
#define __SD_LTX_VAE_HPP__
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "ltxv.hpp"
|
||||||
|
#include "vae.hpp"
|
||||||
|
#include "wan.hpp"
|
||||||
|
|
||||||
|
namespace LTXVAE {
|
||||||
|
|
||||||
|
// Modulates x as x + x * scale + shift, which equals x * (1 + scale) + shift.
static inline ggml_tensor* apply_scale_shift(ggml_context* ctx,
                                             ggml_tensor* x,
                                             ggml_tensor* scale,
                                             ggml_tensor* shift) {
    ggml_tensor* scaled_part = ggml_mul(ctx, x, scale);
    ggml_tensor* modulated   = ggml_add(ctx, x, scaled_part);
    return ggml_add(ctx, modulated, shift);
}
|
||||||
|
|
||||||
|
// Views x as a 4D tensor with ne = {1, 1, 1, nelements(x)}, i.e. all of its
// elements are placed on the outermost ggml dimension.
static inline ggml_tensor* reshape_channel_broadcast(ggml_context* ctx,
                                                     ggml_tensor* x) {
    const int64_t element_count = ggml_nelements(x);
    return ggml_reshape_4d(ctx, x, 1, 1, 1, element_count);
}
|
||||||
|
|
||||||
|
// Builds the {shift, scale} pair used for adaptive modulation.
//   table    - learned table added to the reshaped timestep embedding
//   timestep - embedding holding exactly channels * parts elements (asserted)
//   channels - row width the timestep embedding is reshaped to
//   parts    - number of rows; row 0 becomes shift and row 1 becomes scale
// Both results are reshaped with reshape_channel_broadcast.
static inline std::pair<ggml_tensor*, ggml_tensor*> get_shift_scale(ggml_context* ctx,
                                                                    ggml_tensor* table,
                                                                    ggml_tensor* timestep,
                                                                    int64_t channels,
                                                                    int parts) {
    GGML_ASSERT(timestep != nullptr);
    GGML_ASSERT(ggml_nelements(timestep) == channels * parts);

    ggml_tensor* timestep_rows = ggml_reshape_2d(ctx, timestep, channels, parts);
    ggml_tensor* modulation    = ggml_add(ctx, table, timestep_rows);
    auto rows                  = ggml_ext_chunk(ctx, modulation, parts, 1, false);

    // Each chunk is a view; make it contiguous before reshaping it.
    auto to_channel_tensor = [ctx](ggml_tensor* row) {
        return reshape_channel_broadcast(ctx, ggml_cont(ctx, row));
    };

    ggml_tensor* shift = to_channel_tensor(rows[0]);
    ggml_tensor* scale = to_channel_tensor(rows[1]);
    return {shift, scale};
}
|
||||||
|
|
||||||
|
// Rearranges channel-packed patches into time and space.
// Shapes in the comments below are written outermost-first (torch order).
//   x: [B*c*p1*p2*p3, T, H, W], B == 1, p1 == factor_t, p2 == p3 == factor_s
//   return: [B*c, T*p1, H*p2, W*p3]
// Matches: rearrange(x, "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)")
// When drop_first_temporal_frame is set and factor_t > 1, the first frame of
// the result is sliced off.
static inline ggml_tensor* depth_to_space_3d(ggml_context* ctx,
                                             ggml_tensor* x,
                                             int64_t c,
                                             int factor_t,
                                             int factor_s,
                                             bool drop_first_temporal_frame) {
    const int64_t W     = x->ne[0];
    const int64_t H     = x->ne[1];
    const int64_t T     = x->ne[2];
    const int64_t out_w = factor_s * W;
    const int64_t out_h = factor_s * H;

    // Every permutation is materialised so the following reshape is valid.
    auto permute_cont = [ctx](ggml_tensor* t, int a0, int a1, int a2, int a3) {
        return ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, t, a0, a1, a2, a3));
    };

    x = permute_cont(x, 0, 1, 3, 2);                                           // [T, C, H, W]
    x = ggml_reshape_4d(ctx, x, W, H, factor_s, factor_s * factor_t * c * T);  // [T*c*p1*p2, p3, H, W]
    x = permute_cont(x, 2, 0, 1, 3);                                           // [T*c*p1*p2, H, W, p3]
    x = ggml_reshape_4d(ctx, x, out_w, H, factor_s, factor_t * c * T);         // [T*c*p1, p2, H, W*p3]
    x = permute_cont(x, 0, 2, 1, 3);                                           // [T*c*p1, H, p2, W*p3]
    x = ggml_reshape_4d(ctx, x, out_w * out_h, factor_t, c, T);                // [T, c, p1, H*p2*W*p3]
    x = permute_cont(x, 0, 1, 3, 2);                                           // [c, T, p1, H*p2*W*p3]
    x = ggml_reshape_4d(ctx, x, out_w, out_h, factor_t * T, c);                // [c, T*p1, H*p2, W*p3]

    const bool drop_frame = drop_first_temporal_frame && factor_t > 1 && x->ne[2] > 0;
    if (!drop_frame) {
        return x;
    }
    return ggml_ext_slice(ctx, x, 2, 1, x->ne[2]);
}
|
||||||
|
|
||||||
|
// Thin wrapper that forwards to WAN::WanVAE::patchify with a fixed last
// argument of 1.
static inline ggml_tensor* patchify(ggml_context* ctx,
                                    ggml_tensor* x,
                                    int patch_size) {
    ggml_tensor* patched = WAN::WanVAE::patchify(ctx, x, patch_size, 1);
    return patched;
}
|
||||||
|
|
||||||
|
class CausalConv3d : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int time_kernel_size;
|
||||||
|
|
||||||
|
public:
|
||||||
|
CausalConv3d(int64_t in_channels,
|
||||||
|
int64_t out_channels,
|
||||||
|
int kernel_size = 3,
|
||||||
|
std::tuple<int, int, int> stride = {1, 1, 1},
|
||||||
|
int dilation = 1,
|
||||||
|
bool bias = true) {
|
||||||
|
time_kernel_size = kernel_size;
|
||||||
|
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
|
||||||
|
out_channels,
|
||||||
|
{kernel_size, kernel_size, kernel_size},
|
||||||
|
stride,
|
||||||
|
{0, kernel_size / 2, kernel_size / 2},
|
||||||
|
{dilation, 1, 1},
|
||||||
|
bias));
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x,
|
||||||
|
bool causal = true) {
|
||||||
|
// x: [B*C, T, H, W], B == 1
|
||||||
|
auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);
|
||||||
|
|
||||||
|
if (causal) {
|
||||||
|
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
|
||||||
|
auto first_frame_pad = first_frame;
|
||||||
|
for (int i = 1; i < time_kernel_size - 1; i++) {
|
||||||
|
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
|
||||||
|
}
|
||||||
|
x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
|
||||||
|
} else {
|
||||||
|
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
|
||||||
|
auto first_frame_pad = first_frame;
|
||||||
|
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
|
||||||
|
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto last_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
|
||||||
|
auto last_frame_pad = last_frame;
|
||||||
|
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
|
||||||
|
last_frame_pad = ggml_concat(ctx->ggml_ctx, last_frame_pad, last_frame, 2);
|
||||||
|
}
|
||||||
|
x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
|
||||||
|
x = ggml_concat(ctx->ggml_ctx, x, last_frame_pad, 2);
|
||||||
|
}
|
||||||
|
return conv->forward(ctx, x);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PixelNorm3D : public UnaryBlock {
|
||||||
|
float eps;
|
||||||
|
|
||||||
|
PixelNorm3D(float eps = 1e-8f)
|
||||||
|
: eps(eps) {}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
|
||||||
|
auto h = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 3, 0, 1, 2));
|
||||||
|
h = ggml_rms_norm(ctx->ggml_ctx, h, eps);
|
||||||
|
h = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, h, 1, 2, 3, 0));
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PixArtAlphaCombinedTimestepSizeEmbeddings : public GGMLBlock {
|
||||||
|
int64_t embedding_dim;
|
||||||
|
|
||||||
|
PixArtAlphaCombinedTimestepSizeEmbeddings(int64_t embedding_dim)
|
||||||
|
: embedding_dim(embedding_dim) {
|
||||||
|
blocks["timestep_embedder"] = std::make_shared<LTXV::TimestepEmbedder>(embedding_dim);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* timestep) {
|
||||||
|
auto timestep_embedder = std::dynamic_pointer_cast<LTXV::TimestepEmbedder>(blocks["timestep_embedder"]);
|
||||||
|
return timestep_embedder->forward(ctx, timestep);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ResnetBlock3D : public GGMLBlock {
|
||||||
|
int64_t channels;
|
||||||
|
bool timestep_conditioning;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void init_params(ggml_context* ctx,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
const std::string prefix = "") override {
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
params["scale_shift_table"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, channels, 4);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
ResnetBlock3D(int64_t channels,
|
||||||
|
float eps = 1e-6f,
|
||||||
|
bool timestep_conditioning = false)
|
||||||
|
: channels(channels), timestep_conditioning(timestep_conditioning) {
|
||||||
|
blocks["norm1"] = std::make_shared<PixelNorm3D>(eps);
|
||||||
|
blocks["conv1"] = std::make_shared<CausalConv3d>(channels, channels, 3);
|
||||||
|
blocks["norm2"] = std::make_shared<PixelNorm3D>(eps);
|
||||||
|
blocks["conv2"] = std::make_shared<CausalConv3d>(channels, channels, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x,
|
||||||
|
ggml_tensor* timestep = nullptr,
|
||||||
|
bool causal = false) {
|
||||||
|
auto norm1 = std::dynamic_pointer_cast<PixelNorm3D>(blocks["norm1"]);
|
||||||
|
auto conv1 = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv1"]);
|
||||||
|
auto norm2 = std::dynamic_pointer_cast<PixelNorm3D>(blocks["norm2"]);
|
||||||
|
auto conv2 = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv2"]);
|
||||||
|
|
||||||
|
ggml_tensor* shift1 = nullptr;
|
||||||
|
ggml_tensor* scale1 = nullptr;
|
||||||
|
ggml_tensor* shift2 = nullptr;
|
||||||
|
ggml_tensor* scale2 = nullptr;
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
GGML_ASSERT(timestep != nullptr);
|
||||||
|
auto values = ggml_add(ctx->ggml_ctx,
|
||||||
|
params["scale_shift_table"],
|
||||||
|
ggml_reshape_2d(ctx->ggml_ctx, timestep, channels, 4));
|
||||||
|
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, values, 4, 1, false);
|
||||||
|
shift1 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[0]));
|
||||||
|
scale1 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[1]));
|
||||||
|
shift2 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[2]));
|
||||||
|
scale2 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[3]));
|
||||||
|
}
|
||||||
|
|
||||||
|
auto h = norm1->forward(ctx, x);
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
h = apply_scale_shift(ctx->ggml_ctx, h, scale1, shift1);
|
||||||
|
}
|
||||||
|
h = ggml_silu_inplace(ctx->ggml_ctx, h);
|
||||||
|
h = conv1->forward(ctx, h, causal);
|
||||||
|
|
||||||
|
h = norm2->forward(ctx, h);
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
h = apply_scale_shift(ctx->ggml_ctx, h, scale2, shift2);
|
||||||
|
}
|
||||||
|
h = ggml_silu_inplace(ctx->ggml_ctx, h);
|
||||||
|
h = conv2->forward(ctx, h, causal);
|
||||||
|
|
||||||
|
return ggml_add(ctx->ggml_ctx, h, x);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct UNetMidBlock3D : public GGMLBlock {
|
||||||
|
int64_t channels;
|
||||||
|
int num_layers;
|
||||||
|
bool timestep_conditioning;
|
||||||
|
|
||||||
|
UNetMidBlock3D(int64_t channels,
|
||||||
|
int num_layers,
|
||||||
|
bool timestep_conditioning)
|
||||||
|
: channels(channels),
|
||||||
|
num_layers(num_layers),
|
||||||
|
timestep_conditioning(timestep_conditioning) {
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
blocks["time_embedder"] = std::make_shared<PixArtAlphaCombinedTimestepSizeEmbeddings>(channels * 4);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < num_layers; i++) {
|
||||||
|
blocks["res_blocks." + std::to_string(i)] = std::make_shared<ResnetBlock3D>(channels, 1e-6f, timestep_conditioning);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x,
|
||||||
|
ggml_tensor* timestep = nullptr,
|
||||||
|
bool causal = false) {
|
||||||
|
ggml_tensor* timestep_embed = nullptr;
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
GGML_ASSERT(timestep != nullptr);
|
||||||
|
auto time_embedder = std::dynamic_pointer_cast<PixArtAlphaCombinedTimestepSizeEmbeddings>(blocks["time_embedder"]);
|
||||||
|
timestep_embed = time_embedder->forward(ctx, timestep);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < num_layers; i++) {
|
||||||
|
auto resnet = std::dynamic_pointer_cast<ResnetBlock3D>(blocks["res_blocks." + std::to_string(i)]);
|
||||||
|
x = resnet->forward(ctx, x, timestep_embed, causal);
|
||||||
|
}
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct DepthToSpaceUpsample : public GGMLBlock {
|
||||||
|
int64_t in_channels;
|
||||||
|
int factor_t;
|
||||||
|
int factor_s;
|
||||||
|
int out_channels_reduction_factor;
|
||||||
|
bool residual;
|
||||||
|
|
||||||
|
DepthToSpaceUpsample(int64_t in_channels,
|
||||||
|
int factor_t = 2,
|
||||||
|
int factor_s = 2,
|
||||||
|
int out_channels_reduction_factor = 2,
|
||||||
|
bool residual = true)
|
||||||
|
: in_channels(in_channels),
|
||||||
|
factor_t(factor_t),
|
||||||
|
factor_s(factor_s),
|
||||||
|
out_channels_reduction_factor(out_channels_reduction_factor),
|
||||||
|
residual(residual) {
|
||||||
|
const int64_t factor = static_cast<int64_t>(factor_t) * static_cast<int64_t>(factor_s) * static_cast<int64_t>(factor_s);
|
||||||
|
const int64_t out_dim = (factor * in_channels) / out_channels_reduction_factor;
|
||||||
|
blocks["conv"] = std::make_shared<CausalConv3d>(in_channels, out_dim, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_output_channels() const {
|
||||||
|
return in_channels / out_channels_reduction_factor;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x,
|
||||||
|
bool causal = false) {
|
||||||
|
auto conv = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv"]);
|
||||||
|
|
||||||
|
ggml_tensor* x_in = nullptr;
|
||||||
|
if (residual) {
|
||||||
|
x_in = depth_to_space_3d(ctx->ggml_ctx, x, in_channels / (factor_t * factor_s * factor_s), factor_t, factor_s, factor_t > 1);
|
||||||
|
int repeat = (factor_t * factor_s * factor_s) / out_channels_reduction_factor;
|
||||||
|
auto res = x_in;
|
||||||
|
for (int i = 1; i < repeat; i++) {
|
||||||
|
res = ggml_concat(ctx->ggml_ctx, res, x_in, 3);
|
||||||
|
}
|
||||||
|
x_in = res;
|
||||||
|
}
|
||||||
|
|
||||||
|
x = conv->forward(ctx, x, causal);
|
||||||
|
x = depth_to_space_3d(ctx->ggml_ctx, x, get_output_channels(), factor_t, factor_s, factor_t > 1);
|
||||||
|
if (residual) {
|
||||||
|
x = ggml_add(ctx->ggml_ctx, x, x_in);
|
||||||
|
}
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct SpaceToDepthDownsample : public GGMLBlock {
|
||||||
|
int64_t in_channels;
|
||||||
|
int64_t out_channels;
|
||||||
|
int factor_t;
|
||||||
|
int factor_s;
|
||||||
|
|
||||||
|
SpaceToDepthDownsample(int64_t in_channels,
|
||||||
|
int64_t out_channels,
|
||||||
|
int factor_t,
|
||||||
|
int factor_s)
|
||||||
|
: in_channels(in_channels),
|
||||||
|
out_channels(out_channels),
|
||||||
|
factor_t(factor_t),
|
||||||
|
factor_s(factor_s) {
|
||||||
|
const int64_t factor = static_cast<int64_t>(factor_t) * static_cast<int64_t>(factor_s) * static_cast<int64_t>(factor_s);
|
||||||
|
GGML_ASSERT(out_channels % factor == 0);
|
||||||
|
|
||||||
|
blocks["conv"] = std::make_shared<CausalConv3d>(in_channels, out_channels / factor, 3);
|
||||||
|
blocks["skip_downsample"] = std::make_shared<WAN::AvgDown3D>(in_channels, out_channels, factor_t, factor_s);
|
||||||
|
blocks["conv_downsample"] = std::make_shared<WAN::AvgDown3D>(out_channels / factor, out_channels, factor_t, factor_s);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x,
|
||||||
|
bool causal = true) {
|
||||||
|
auto conv = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv"]);
|
||||||
|
auto skip_downsample = std::dynamic_pointer_cast<WAN::AvgDown3D>(blocks["skip_downsample"]);
|
||||||
|
auto conv_downsample = std::dynamic_pointer_cast<WAN::AvgDown3D>(blocks["conv_downsample"]);
|
||||||
|
|
||||||
|
if (factor_t > 1 && x->ne[2] > 0) {
|
||||||
|
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
|
||||||
|
auto first_frame_pad = first_frame;
|
||||||
|
for (int i = 1; i < factor_t; ++i) {
|
||||||
|
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
|
||||||
|
}
|
||||||
|
x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto residual = skip_downsample->forward(ctx, x);
|
||||||
|
auto h = conv->forward(ctx, x, causal);
|
||||||
|
h = conv_downsample->forward(ctx, h);
|
||||||
|
return ggml_add(ctx->ggml_ctx, h, residual);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PerChannelStatistics : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
void init_params(ggml_context* ctx,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
const std::string prefix = "") override {
|
||||||
|
params["std-of-means"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128);
|
||||||
|
params["mean-of-means"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128);
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
ggml_tensor* un_normalize(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x) {
|
||||||
|
auto std_tensor = reshape_channel_broadcast(ctx->ggml_ctx, params["std-of-means"]);
|
||||||
|
auto mean_tensor = reshape_channel_broadcast(ctx->ggml_ctx, params["mean-of-means"]);
|
||||||
|
return ggml_add(ctx->ggml_ctx, ggml_mul(ctx->ggml_ctx, x, std_tensor), mean_tensor);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* normalize(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x) {
|
||||||
|
auto std_tensor = reshape_channel_broadcast(ctx->ggml_ctx, params["std-of-means"]);
|
||||||
|
auto mean_tensor = reshape_channel_broadcast(ctx->ggml_ctx, params["mean-of-means"]);
|
||||||
|
return ggml_div(ctx->ggml_ctx, ggml_sub(ctx->ggml_ctx, x, mean_tensor), std_tensor);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Ordered description of the decoder's up blocks.
struct DecoderConfig {
    // One entry per up block.
    struct Block {
        std::string type;   // block kind, e.g. "res_x" or "compress_all"
        int num_layers{0};  // residual layer count (set for "res_x" blocks)
        int multiplier{1};  // channel reduction factor for upsample blocks
    };

    std::vector<Block> blocks;
};
|
||||||
|
|
||||||
|
// Ordered description of the encoder's down blocks.
struct EncoderConfig {
    // One entry per down block.
    struct Block {
        std::string type;   // block kind, e.g. "res_x" or "compress_all_res"
        int num_layers{0};  // residual layer count (set for "res_x" blocks)
        int multiplier{1};  // channel multiplier for downsample blocks
    };

    std::vector<Block> blocks;
};
|
||||||
|
|
||||||
|
static inline bool has_tensor(const String2TensorStorage& tensor_storage_map,
|
||||||
|
const std::string& name) {
|
||||||
|
return tensor_storage_map.find(name) != tensor_storage_map.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map,
|
||||||
|
const std::string& name,
|
||||||
|
int64_t fallback = 0) {
|
||||||
|
auto iter = tensor_storage_map.find(name);
|
||||||
|
if (iter == tensor_storage_map.end()) {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
return iter->second.ne[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reconstructs the decoder block layout by probing which bias tensors exist
// under "<prefix>.decoder.up_blocks.<i>".
//   - A block with ".res_blocks.0.conv1.conv.bias" is a "res_x" block; its
//     layer count is the number of consecutive res_blocks found.
//   - A block with ".conv.conv.bias" is an upsample block; its kind is chosen
//     from conv_out / next_channels (8 -> compress_all, 4 -> compress_space,
//     2 -> compress_time, anything else -> warning + compress_all).
//   - The scan stops at the first index that has neither tensor.
// conv_in_channels is the channel count entering the first up block.
static inline DecoderConfig infer_decoder_config_from_weights(const String2TensorStorage& tensor_storage_map,
                                                              const std::string& prefix,
                                                              int64_t conv_in_channels) {
    DecoderConfig cfg;
    const std::string up_blocks_prefix = prefix + ".decoder.up_blocks.";

    auto res_bias_name = [&up_blocks_prefix](int block, int layer) {
        return up_blocks_prefix + std::to_string(block) + ".res_blocks." + std::to_string(layer) + ".conv1.conv.bias";
    };
    auto conv_bias_name = [&up_blocks_prefix](int block) {
        return up_blocks_prefix + std::to_string(block) + ".conv.conv.bias";
    };

    int64_t current_channels = conv_in_channels;
    for (int block_idx = 0;; ++block_idx) {
        const std::string block_prefix = up_blocks_prefix + std::to_string(block_idx);
        const std::string res0_bias    = res_bias_name(block_idx, 0);

        if (has_tensor(tensor_storage_map, res0_bias)) {
            int num_layers = 0;
            while (has_tensor(tensor_storage_map, res_bias_name(block_idx, num_layers))) {
                ++num_layers;
            }
            cfg.blocks.push_back({"res_x", num_layers, 1});
            current_channels = get_tensor_ne0(tensor_storage_map, res0_bias, current_channels);
            continue;
        }

        const std::string conv_bias = conv_bias_name(block_idx);
        if (!has_tensor(tensor_storage_map, conv_bias)) {
            break;
        }

        // Look ahead for the next residual block to learn the channel count
        // this upsample block must produce; give up at the first gap.
        int64_t next_channels = 0;
        for (int lookahead = block_idx + 1;; ++lookahead) {
            const std::string lookahead_res0 = res_bias_name(lookahead, 0);
            if (has_tensor(tensor_storage_map, lookahead_res0)) {
                next_channels = get_tensor_ne0(tensor_storage_map, lookahead_res0);
                break;
            }
            if (!has_tensor(tensor_storage_map, conv_bias_name(lookahead))) {
                break;
            }
        }
        const bool usable = next_channels > 0 && current_channels % next_channels == 0;
        if (!usable) {
            // Unknown or non-dividing target: assume the channels are halved.
            next_channels = std::max<int64_t>(1, current_channels / 2);
        }

        const int64_t conv_out_dim = get_tensor_ne0(tensor_storage_map, conv_bias);
        const int64_t reduction    = std::max<int64_t>(1, current_channels / next_channels);
        const int64_t factor       = next_channels > 0 ? conv_out_dim / next_channels : 0;

        const char* block_type = "compress_all";
        switch (factor) {
            case 8:
                break;
            case 4:
                block_type = "compress_space";
                break;
            case 2:
                block_type = "compress_time";
                break;
            default:
                LOG_WARN("unexpected LTX VAE upsample factor at '%s': conv_out=%lld current=%lld next=%lld, falling back to compress_all x%d",
                         block_prefix.c_str(),
                         static_cast<long long>(conv_out_dim),
                         static_cast<long long>(current_channels),
                         static_cast<long long>(next_channels),
                         static_cast<int>(reduction));
                break;
        }
        cfg.blocks.push_back({block_type, 0, static_cast<int>(reduction)});
        current_channels = next_channels;
    }

    return cfg;
}
|
||||||
|
|
||||||
|
static inline int detect_ltx_vae_version(const String2TensorStorage& tensor_storage_map,
|
||||||
|
const std::string& prefix) {
|
||||||
|
const std::string v2_probe = prefix + ".encoder.down_blocks.1.conv.conv.bias";
|
||||||
|
if (tensor_storage_map.find(v2_probe) != tensor_storage_map.end()) {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool detect_ltx_vae_timestep_conditioning(const String2TensorStorage& tensor_storage_map,
|
||||||
|
const std::string& prefix) {
|
||||||
|
return tensor_storage_map.find(prefix + ".decoder.timestep_scale_multiplier") != tensor_storage_map.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the fixed encoder layout. Aborts for version < 2, for which no
// encoder layout is implemented.
static inline EncoderConfig get_encoder_config(int version) {
    if (version < 2) {
        GGML_ABORT("LTX VAE encoder is only implemented for version >= 2");
    }

    EncoderConfig cfg;
    auto add_block = [&cfg](const char* type, int num_layers, int multiplier) {
        cfg.blocks.push_back({type, num_layers, multiplier});
    };

    add_block("res_x", 4, 1);
    add_block("compress_space_res", 0, 2);
    add_block("res_x", 6, 1);
    add_block("compress_time_res", 0, 2);
    add_block("res_x", 6, 1);
    add_block("compress_all_res", 0, 2);
    add_block("res_x", 2, 1);
    add_block("compress_all_res", 0, 2);
    add_block("res_x", 2, 1);
    return cfg;
}
|
||||||
|
|
||||||
|
struct Encoder : public GGMLBlock {
|
||||||
|
int version;
|
||||||
|
int patch_size;
|
||||||
|
int64_t in_channels;
|
||||||
|
int64_t latent_channels;
|
||||||
|
|
||||||
|
Encoder(int version,
|
||||||
|
int patch_size = 4,
|
||||||
|
int64_t in_channels = 3,
|
||||||
|
int64_t latent_channels = 128)
|
||||||
|
: version(version),
|
||||||
|
patch_size(patch_size),
|
||||||
|
in_channels(in_channels),
|
||||||
|
latent_channels(latent_channels) {
|
||||||
|
auto cfg = get_encoder_config(version);
|
||||||
|
int64_t channels = 128;
|
||||||
|
int64_t in_dim = in_channels * patch_size * patch_size;
|
||||||
|
|
||||||
|
blocks["conv_in"] = std::make_shared<CausalConv3d>(in_dim, channels, 3);
|
||||||
|
|
||||||
|
for (int block_idx = 0; block_idx < static_cast<int>(cfg.blocks.size()); ++block_idx) {
|
||||||
|
const auto& block = cfg.blocks[block_idx];
|
||||||
|
if (block.type == "res_x") {
|
||||||
|
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<UNetMidBlock3D>(channels,
|
||||||
|
block.num_layers,
|
||||||
|
false);
|
||||||
|
} else if (block.type == "compress_space_res") {
|
||||||
|
int64_t next_channels = channels * block.multiplier;
|
||||||
|
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
|
||||||
|
next_channels,
|
||||||
|
1,
|
||||||
|
2);
|
||||||
|
channels = next_channels;
|
||||||
|
} else if (block.type == "compress_time_res") {
|
||||||
|
int64_t next_channels = channels * block.multiplier;
|
||||||
|
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
|
||||||
|
next_channels,
|
||||||
|
2,
|
||||||
|
1);
|
||||||
|
channels = next_channels;
|
||||||
|
} else if (block.type == "compress_all_res") {
|
||||||
|
int64_t next_channels = channels * block.multiplier;
|
||||||
|
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
|
||||||
|
next_channels,
|
||||||
|
2,
|
||||||
|
2);
|
||||||
|
channels = next_channels;
|
||||||
|
} else {
|
||||||
|
GGML_ABORT("Unsupported LTX VAE encoder block");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks["conv_norm_out"] = std::make_shared<PixelNorm3D>();
|
||||||
|
blocks["conv_out"] = std::make_shared<CausalConv3d>(channels, latent_channels + 1, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x) {
|
||||||
|
auto conv_in = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_in"]);
|
||||||
|
auto conv_norm_out = std::dynamic_pointer_cast<PixelNorm3D>(blocks["conv_norm_out"]);
|
||||||
|
auto conv_out = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_out"]);
|
||||||
|
|
||||||
|
x = conv_in->forward(ctx, x, true);
|
||||||
|
|
||||||
|
int block_idx = 0;
|
||||||
|
while (blocks.find("down_blocks." + std::to_string(block_idx)) != blocks.end()) {
|
||||||
|
auto mid_block = std::dynamic_pointer_cast<UNetMidBlock3D>(blocks["down_blocks." + std::to_string(block_idx)]);
|
||||||
|
if (mid_block) {
|
||||||
|
x = mid_block->forward(ctx, x, nullptr, true);
|
||||||
|
} else {
|
||||||
|
auto downsample = std::dynamic_pointer_cast<SpaceToDepthDownsample>(blocks["down_blocks." + std::to_string(block_idx)]);
|
||||||
|
x = downsample->forward(ctx, x, true);
|
||||||
|
}
|
||||||
|
block_idx++;
|
||||||
|
}
|
||||||
|
|
||||||
|
x = conv_norm_out->forward(ctx, x);
|
||||||
|
x = ggml_silu_inplace(ctx->ggml_ctx, x);
|
||||||
|
x = conv_out->forward(ctx, x, true);
|
||||||
|
|
||||||
|
auto last_channel = ggml_ext_slice(ctx->ggml_ctx, x, 3, x->ne[3] - 1, x->ne[3]);
|
||||||
|
auto repeat_shape = ggml_new_tensor_4d(ctx->ggml_ctx, last_channel->type, last_channel->ne[0], last_channel->ne[1], last_channel->ne[2], latent_channels - 1);
|
||||||
|
auto repeated = ggml_repeat(ctx->ggml_ctx, last_channel, repeat_shape);
|
||||||
|
return ggml_concat(ctx->ggml_ctx, x, repeated, 3);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Decoder : public GGMLBlock {
|
||||||
|
int version;
|
||||||
|
int patch_size;
|
||||||
|
bool causal_decoder;
|
||||||
|
bool timestep_conditioning;
|
||||||
|
int64_t in_channels;
|
||||||
|
int64_t hidden_channels;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void init_params(ggml_context* ctx,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
const std::string prefix = "") override {
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
params["timestep_scale_multiplier"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
||||||
|
params["last_scale_shift_table"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_channels, 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
Decoder(int version,
|
||||||
|
const String2TensorStorage& tensor_storage_map,
|
||||||
|
const std::string& prefix,
|
||||||
|
int patch_size = 4,
|
||||||
|
bool causal_decoder = false,
|
||||||
|
bool timestep_conditioning = true,
|
||||||
|
int64_t in_channels = 128,
|
||||||
|
int64_t hidden_channels = 128)
|
||||||
|
: version(version),
|
||||||
|
patch_size(patch_size),
|
||||||
|
causal_decoder(causal_decoder),
|
||||||
|
timestep_conditioning(timestep_conditioning),
|
||||||
|
in_channels(in_channels),
|
||||||
|
hidden_channels(hidden_channels) {
|
||||||
|
const int64_t conv_in_out_channels = get_tensor_ne0(tensor_storage_map,
|
||||||
|
prefix + ".decoder.conv_in.conv.bias",
|
||||||
|
hidden_channels);
|
||||||
|
auto cfg = infer_decoder_config_from_weights(tensor_storage_map,
|
||||||
|
prefix,
|
||||||
|
conv_in_out_channels);
|
||||||
|
int64_t channels = conv_in_out_channels;
|
||||||
|
|
||||||
|
blocks["conv_in"] = std::make_shared<CausalConv3d>(in_channels, channels, 3);
|
||||||
|
|
||||||
|
for (int block_idx = 0; block_idx < static_cast<int>(cfg.blocks.size()); ++block_idx) {
|
||||||
|
const auto& block = cfg.blocks[block_idx];
|
||||||
|
if (block.type == "res_x") {
|
||||||
|
blocks["up_blocks." + std::to_string(block_idx)] = std::make_shared<UNetMidBlock3D>(channels,
|
||||||
|
block.num_layers,
|
||||||
|
timestep_conditioning);
|
||||||
|
} else if (block.type == "compress_all") {
|
||||||
|
blocks["up_blocks." + std::to_string(block_idx)] = std::make_shared<DepthToSpaceUpsample>(channels,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
block.multiplier,
|
||||||
|
false);
|
||||||
|
channels /= block.multiplier;
|
||||||
|
} else if (block.type == "compress_time") {
|
||||||
|
blocks["up_blocks." + std::to_string(block_idx)] = std::make_shared<DepthToSpaceUpsample>(channels,
|
||||||
|
2,
|
||||||
|
1,
|
||||||
|
block.multiplier,
|
||||||
|
false);
|
||||||
|
channels /= block.multiplier;
|
||||||
|
} else if (block.type == "compress_space") {
|
||||||
|
blocks["up_blocks." + std::to_string(block_idx)] = std::make_shared<DepthToSpaceUpsample>(channels,
|
||||||
|
1,
|
||||||
|
2,
|
||||||
|
block.multiplier,
|
||||||
|
false);
|
||||||
|
channels /= block.multiplier;
|
||||||
|
} else {
|
||||||
|
GGML_ABORT("Unsupported LTX VAE decoder block");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
hidden_channels = channels;
|
||||||
|
blocks["conv_norm_out"] = std::make_shared<PixelNorm3D>();
|
||||||
|
blocks["conv_out"] = std::make_shared<CausalConv3d>(hidden_channels, 3 * patch_size * patch_size, 3);
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
blocks["last_time_embedder"] = std::make_shared<PixArtAlphaCombinedTimestepSizeEmbeddings>(hidden_channels * 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x,
|
||||||
|
ggml_tensor* timestep) {
|
||||||
|
auto conv_in = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_in"]);
|
||||||
|
auto conv_norm_out = std::dynamic_pointer_cast<PixelNorm3D>(blocks["conv_norm_out"]);
|
||||||
|
auto conv_out = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_out"]);
|
||||||
|
|
||||||
|
ggml_tensor* scaled_timestep = timestep;
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
auto multiplier = ggml_ext_backend_tensor_get_f32(params["timestep_scale_multiplier"]);
|
||||||
|
scaled_timestep = ggml_ext_scale(ctx->ggml_ctx, timestep, multiplier);
|
||||||
|
}
|
||||||
|
|
||||||
|
x = conv_in->forward(ctx, x, causal_decoder);
|
||||||
|
|
||||||
|
int block_idx = 0;
|
||||||
|
while (blocks.find("up_blocks." + std::to_string(block_idx)) != blocks.end()) {
|
||||||
|
auto mid_block = std::dynamic_pointer_cast<UNetMidBlock3D>(blocks["up_blocks." + std::to_string(block_idx)]);
|
||||||
|
if (mid_block) {
|
||||||
|
x = mid_block->forward(ctx, x, scaled_timestep, causal_decoder);
|
||||||
|
} else {
|
||||||
|
auto upsample = std::dynamic_pointer_cast<DepthToSpaceUpsample>(blocks["up_blocks." + std::to_string(block_idx)]);
|
||||||
|
x = upsample->forward(ctx, x, causal_decoder);
|
||||||
|
}
|
||||||
|
block_idx++;
|
||||||
|
}
|
||||||
|
|
||||||
|
x = conv_norm_out->forward(ctx, x);
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
auto last_time_embedder = std::dynamic_pointer_cast<PixArtAlphaCombinedTimestepSizeEmbeddings>(blocks["last_time_embedder"]);
|
||||||
|
auto timestep_embed = last_time_embedder->forward(ctx, scaled_timestep);
|
||||||
|
auto [shift, scale] = get_shift_scale(ctx->ggml_ctx,
|
||||||
|
params["last_scale_shift_table"],
|
||||||
|
timestep_embed,
|
||||||
|
hidden_channels,
|
||||||
|
2);
|
||||||
|
x = apply_scale_shift(ctx->ggml_ctx, x, scale, shift);
|
||||||
|
}
|
||||||
|
x = ggml_silu_inplace(ctx->ggml_ctx, x);
|
||||||
|
x = conv_out->forward(ctx, x, causal_decoder);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct VideoVAE : public GGMLBlock {
|
||||||
|
int version;
|
||||||
|
float decode_timestep;
|
||||||
|
bool timestep_conditioning;
|
||||||
|
int patch_size;
|
||||||
|
bool decode_only;
|
||||||
|
|
||||||
|
VideoVAE(int version,
|
||||||
|
bool decode_only,
|
||||||
|
bool timestep_conditioning,
|
||||||
|
int patch_size,
|
||||||
|
const String2TensorStorage& tensor_storage_map,
|
||||||
|
const std::string& prefix,
|
||||||
|
float decode_timestep = 0.05f)
|
||||||
|
: version(version),
|
||||||
|
decode_timestep(decode_timestep),
|
||||||
|
timestep_conditioning(timestep_conditioning),
|
||||||
|
patch_size(patch_size),
|
||||||
|
decode_only(decode_only) {
|
||||||
|
if (!decode_only) {
|
||||||
|
blocks["encoder"] = std::make_shared<Encoder>(version, patch_size);
|
||||||
|
}
|
||||||
|
blocks["decoder"] = std::make_shared<Decoder>(version,
|
||||||
|
tensor_storage_map,
|
||||||
|
prefix,
|
||||||
|
patch_size,
|
||||||
|
false,
|
||||||
|
timestep_conditioning);
|
||||||
|
blocks["per_channel_statistics"] = std::make_shared<PerChannelStatistics>();
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* decode(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* z,
|
||||||
|
ggml_tensor* timestep) {
|
||||||
|
auto decoder = std::dynamic_pointer_cast<Decoder>(blocks["decoder"]);
|
||||||
|
auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
|
||||||
|
auto latents = processor->un_normalize(ctx, z);
|
||||||
|
auto out = decoder->forward(ctx, latents, timestep);
|
||||||
|
out = WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* encode(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x) {
|
||||||
|
GGML_ASSERT(!decode_only);
|
||||||
|
auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);
|
||||||
|
auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
|
||||||
|
|
||||||
|
x = patchify(ctx->ggml_ctx, x, patch_size);
|
||||||
|
auto out = encoder->forward(ctx, x);
|
||||||
|
auto mean = ggml_ext_chunk(ctx->ggml_ctx, out, 2, 3, false)[0];
|
||||||
|
mean = ggml_cont(ctx->ggml_ctx, mean);
|
||||||
|
return processor->normalize(ctx, mean);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace LTXVAE
|
||||||
|
|
||||||
|
struct LTXVideoVAE : public VAE {
|
||||||
|
bool decode_only;
|
||||||
|
int ltx_vae_version;
|
||||||
|
bool timestep_conditioning;
|
||||||
|
int patch_size;
|
||||||
|
sd::Tensor<float> decode_timestep_tensor;
|
||||||
|
LTXVAE::VideoVAE vae;
|
||||||
|
|
||||||
|
LTXVideoVAE(ggml_backend_t backend,
|
||||||
|
bool offload_params_to_cpu,
|
||||||
|
const String2TensorStorage& tensor_storage_map,
|
||||||
|
const std::string& prefix,
|
||||||
|
bool decode_only = true,
|
||||||
|
SDVersion version = VERSION_LTXAV)
|
||||||
|
: decode_only(decode_only),
|
||||||
|
ltx_vae_version(LTXVAE::detect_ltx_vae_version(tensor_storage_map, prefix)),
|
||||||
|
timestep_conditioning(LTXVAE::detect_ltx_vae_timestep_conditioning(tensor_storage_map, prefix)),
|
||||||
|
patch_size(4),
|
||||||
|
decode_timestep_tensor(sd::Tensor<float>::from_vector({0.05f})),
|
||||||
|
vae(LTXVAE::detect_ltx_vae_version(tensor_storage_map, prefix),
|
||||||
|
decode_only,
|
||||||
|
LTXVAE::detect_ltx_vae_timestep_conditioning(tensor_storage_map, prefix),
|
||||||
|
patch_size,
|
||||||
|
tensor_storage_map,
|
||||||
|
prefix),
|
||||||
|
VAE(version, backend, offload_params_to_cpu) {
|
||||||
|
vae.init(params_ctx, tensor_storage_map, prefix);
|
||||||
|
decode_timestep_tensor.values()[0] = vae.decode_timestep;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() override {
|
||||||
|
return "ltx_video_vae";
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {
|
||||||
|
vae.get_param_tensors(tensors, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_cgraph* build_graph(const sd::Tensor<float>& z_tensor, bool decode_graph) {
|
||||||
|
LOG_DEBUG("ltx_video_vae build_graph input %dx%dx%dx%d",
|
||||||
|
(int)z_tensor.shape()[0],
|
||||||
|
(int)z_tensor.shape()[1],
|
||||||
|
(int)z_tensor.shape()[2],
|
||||||
|
(int)z_tensor.shape()[3]);
|
||||||
|
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
||||||
|
ggml_tensor* z = make_input(z_tensor);
|
||||||
|
ggml_tensor* timestep = nullptr;
|
||||||
|
if (timestep_conditioning) {
|
||||||
|
timestep = make_input(decode_timestep_tensor);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto runner_ctx = get_context();
|
||||||
|
ggml_tensor* out = decode_graph ? vae.decode(&runner_ctx, z, timestep) : vae.encode(&runner_ctx, z);
|
||||||
|
LOG_DEBUG("ltx_video_vae build_graph output ne=[%lld,%lld,%lld,%lld]",
|
||||||
|
(long long)out->ne[0],
|
||||||
|
(long long)out->ne[1],
|
||||||
|
(long long)out->ne[2],
|
||||||
|
(long long)out->ne[3]);
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> _compute(const int n_threads,
|
||||||
|
const sd::Tensor<float>& z,
|
||||||
|
bool decode_graph) override {
|
||||||
|
if (!decode_graph && decode_only) {
|
||||||
|
LOG_ERROR("LTX video VAE encoder is not implemented yet");
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
sd::Tensor<float> input = z;
|
||||||
|
size_t expected_dim = static_cast<size_t>(z.dim());
|
||||||
|
if (!decode_graph) {
|
||||||
|
if (input.dim() == 4) {
|
||||||
|
input = input.unsqueeze(2);
|
||||||
|
expected_dim = 5;
|
||||||
|
} else if (input.dim() != 5) {
|
||||||
|
LOG_ERROR("LTX video VAE encoder expects 4D image or 5D video input, got dim=%lld",
|
||||||
|
(long long)input.dim());
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t cropped_t = std::max<int64_t>(1, 1 + ((input.shape()[2] - 1) / 8) * 8);
|
||||||
|
if (cropped_t != input.shape()[2]) {
|
||||||
|
input = sd::ops::slice(input, 2, 0, cropped_t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
auto get_graph = [&]() -> ggml_cgraph* {
|
||||||
|
return build_graph(input, decode_graph);
|
||||||
|
};
|
||||||
|
auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), expected_dim);
|
||||||
|
if (result.empty()) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
LOG_DEBUG("ltx_video_vae host output shape=[%lld,%lld,%lld,%lld] dim=%lld",
|
||||||
|
(long long)(result.shape().size() > 0 ? result.shape()[0] : 0),
|
||||||
|
(long long)(result.shape().size() > 1 ? result.shape()[1] : 0),
|
||||||
|
(long long)(result.shape().size() > 2 ? result.shape()[2] : 0),
|
||||||
|
(long long)(result.shape().size() > 3 ? result.shape()[3] : 0),
|
||||||
|
(long long)result.dim());
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
int get_encoder_output_channels(int input_channels) override {
|
||||||
|
SD_UNUSED(input_channels);
|
||||||
|
return 256;
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
|
||||||
|
SD_UNUSED(rng);
|
||||||
|
if (vae_output.dim() >= 4 && vae_output.shape()[3] > 128) {
|
||||||
|
return sd::ops::slice(vae_output, 3, 0, 128);
|
||||||
|
}
|
||||||
|
return vae_output;
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
|
||||||
|
return latents;
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
|
||||||
|
return latents;
|
||||||
|
}
|
||||||
|
|
||||||
|
void test(const std::string& input_path) {
|
||||||
|
auto z = sd::load_tensor_from_file_as_tensor<float>(input_path);
|
||||||
|
print_sd_tensor(z, false, "ltx_vae_z");
|
||||||
|
|
||||||
|
z = diffusion_to_vae_latents(z);
|
||||||
|
|
||||||
|
int64_t t0 = ggml_time_ms();
|
||||||
|
auto out = _compute(8, z, true);
|
||||||
|
int64_t t1 = ggml_time_ms();
|
||||||
|
|
||||||
|
GGML_ASSERT(!out.empty());
|
||||||
|
print_sd_tensor(out, false, "ltx_vae_out");
|
||||||
|
LOG_DEBUG("ltx vae test done in %lldms", t1 - t0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void load_from_file_and_test(const std::string& model_path,
|
||||||
|
const std::string& input_path) {
|
||||||
|
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||||
|
ggml_backend_t backend = ggml_backend_cpu_init();
|
||||||
|
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
|
||||||
|
|
||||||
|
ModelLoader model_loader;
|
||||||
|
if (!model_loader.init_from_file_and_convert_name(model_path, "vae.")) {
|
||||||
|
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
||||||
|
std::shared_ptr<LTXVideoVAE> vae = std::make_shared<LTXVideoVAE>(backend,
|
||||||
|
false,
|
||||||
|
tensor_storage_map,
|
||||||
|
"first_stage_model",
|
||||||
|
true,
|
||||||
|
VERSION_LTXAV);
|
||||||
|
|
||||||
|
vae->alloc_params_buffer();
|
||||||
|
std::map<std::string, ggml_tensor*> tensors;
|
||||||
|
vae->get_param_tensors(tensors, "first_stage_model");
|
||||||
|
|
||||||
|
if (!model_loader.load_tensors(tensors)) {
|
||||||
|
LOG_ERROR("load tensors from model loader failed");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_INFO("ltx vae model loaded");
|
||||||
|
vae->test(input_path);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __SD_LTX_VAE_HPP__
|
||||||
1921
src/ltxv.hpp
1921
src/ltxv.hpp
File diff suppressed because it is too large
Load Diff
@ -471,6 +471,9 @@ SDVersion ModelLoader::get_sd_version() {
|
|||||||
if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
|
if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
|
||||||
return VERSION_ERNIE_IMAGE;
|
return VERSION_ERNIE_IMAGE;
|
||||||
}
|
}
|
||||||
|
if (tensor_storage.name.find("model.diffusion_model.adaln_single.emb.timestep_embedder.linear_1.bias") != std::string::npos) {
|
||||||
|
return VERSION_LTXAV;
|
||||||
|
}
|
||||||
if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) {
|
if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) {
|
||||||
is_wan = true;
|
is_wan = true;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -42,6 +42,7 @@ enum SDVersion {
|
|||||||
VERSION_ANIMA,
|
VERSION_ANIMA,
|
||||||
VERSION_FLUX2,
|
VERSION_FLUX2,
|
||||||
VERSION_FLUX2_KLEIN,
|
VERSION_FLUX2_KLEIN,
|
||||||
|
VERSION_LTXAV,
|
||||||
VERSION_Z_IMAGE,
|
VERSION_Z_IMAGE,
|
||||||
VERSION_OVIS_IMAGE,
|
VERSION_OVIS_IMAGE,
|
||||||
VERSION_ERNIE_IMAGE,
|
VERSION_ERNIE_IMAGE,
|
||||||
@ -104,6 +105,13 @@ static inline bool sd_version_is_flux2(SDVersion version) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// True only for the LTXAV model version.
static inline bool sd_version_is_ltxav(SDVersion version) {
    return version == VERSION_LTXAV;
}
|
||||||
|
|
||||||
static inline bool sd_version_is_wan(SDVersion version) {
|
static inline bool sd_version_is_wan(SDVersion version) {
|
||||||
if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) {
|
if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) {
|
||||||
return true;
|
return true;
|
||||||
@ -160,6 +168,7 @@ static inline bool sd_version_is_inpaint(SDVersion version) {
|
|||||||
static inline bool sd_version_is_dit(SDVersion version) {
|
static inline bool sd_version_is_dit(SDVersion version) {
|
||||||
if (sd_version_is_flux(version) ||
|
if (sd_version_is_flux(version) ||
|
||||||
sd_version_is_flux2(version) ||
|
sd_version_is_flux2(version) ||
|
||||||
|
sd_version_is_ltxav(version) ||
|
||||||
sd_version_is_sd3(version) ||
|
sd_version_is_sd3(version) ||
|
||||||
sd_version_is_wan(version) ||
|
sd_version_is_wan(version) ||
|
||||||
sd_version_is_qwen_image(version) ||
|
sd_version_is_qwen_image(version) ||
|
||||||
|
|||||||
@ -14,6 +14,7 @@
|
|||||||
#include "diffusion_model.hpp"
|
#include "diffusion_model.hpp"
|
||||||
#include "esrgan.hpp"
|
#include "esrgan.hpp"
|
||||||
#include "lora.hpp"
|
#include "lora.hpp"
|
||||||
|
#include "ltx_vae.hpp"
|
||||||
#include "pmid.hpp"
|
#include "pmid.hpp"
|
||||||
#include "sample-cache.h"
|
#include "sample-cache.h"
|
||||||
#include "tae.hpp"
|
#include "tae.hpp"
|
||||||
@ -52,6 +53,7 @@ const char* model_version_to_str[] = {
|
|||||||
"Anima",
|
"Anima",
|
||||||
"Flux.2",
|
"Flux.2",
|
||||||
"Flux.2 klein",
|
"Flux.2 klein",
|
||||||
|
"LTXAV",
|
||||||
"Z-Image",
|
"Z-Image",
|
||||||
"Ovis Image",
|
"Ovis Image",
|
||||||
"Ernie Image",
|
"Ernie Image",
|
||||||
@ -351,6 +353,17 @@ public:
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (strlen(SAFE_STR(sd_ctx_params->embeddings_connectors_path)) > 0) {
|
||||||
|
if (sd_version_is_ltxav(version)) {
|
||||||
|
LOG_INFO("loading embeddings connectors from '%s'", sd_ctx_params->embeddings_connectors_path);
|
||||||
|
if (!model_loader.init_from_file(sd_ctx_params->embeddings_connectors_path)) {
|
||||||
|
LOG_WARN("loading embeddings connectors from '%s' failed", sd_ctx_params->embeddings_connectors_path);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOG_WARN("ignoring embeddings connectors for non-LTXAV model: '%s'", sd_ctx_params->embeddings_connectors_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
||||||
|
|
||||||
LOG_INFO("Version: %s ", model_version_to_str[version]);
|
LOG_INFO("Version: %s ", model_version_to_str[version]);
|
||||||
@ -415,6 +428,9 @@ public:
|
|||||||
// Might need vae encode for control cond
|
// Might need vae encode for control cond
|
||||||
vae_decode_only = false;
|
vae_decode_only = false;
|
||||||
}
|
}
|
||||||
|
if (sd_version_is_ltxav(version)) {
|
||||||
|
vae_decode_only = true;
|
||||||
|
}
|
||||||
|
|
||||||
bool tae_preview_only = sd_ctx_params->tae_preview_only;
|
bool tae_preview_only = sd_ctx_params->tae_preview_only;
|
||||||
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
||||||
@ -492,6 +508,14 @@ public:
|
|||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
version,
|
version,
|
||||||
sd_ctx_params->chroma_use_dit_mask);
|
sd_ctx_params->chroma_use_dit_mask);
|
||||||
|
} else if (sd_version_is_ltxav(version)) {
|
||||||
|
cond_stage_model = std::make_shared<LTXAVEmbedder>(clip_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
|
tensor_storage_map);
|
||||||
|
diffusion_model = std::make_shared<LTXAVModel>(backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
|
tensor_storage_map,
|
||||||
|
"model.diffusion_model");
|
||||||
} else if (sd_version_is_wan(version)) {
|
} else if (sd_version_is_wan(version)) {
|
||||||
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
|
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
|
||||||
offload_params_to_cpu,
|
offload_params_to_cpu,
|
||||||
@ -638,9 +662,16 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
auto create_vae = [&]() -> std::shared_ptr<VAE> {
|
auto create_vae = [&]() -> std::shared_ptr<VAE> {
|
||||||
if (sd_version_is_wan(version) ||
|
if (sd_version_is_ltxav(version)) {
|
||||||
sd_version_is_qwen_image(version) ||
|
return std::make_shared<LTXVideoVAE>(vae_backend,
|
||||||
sd_version_is_anima(version)) {
|
offload_params_to_cpu,
|
||||||
|
tensor_storage_map,
|
||||||
|
"first_stage_model",
|
||||||
|
true,
|
||||||
|
version);
|
||||||
|
} else if (sd_version_is_wan(version) ||
|
||||||
|
sd_version_is_qwen_image(version) ||
|
||||||
|
sd_version_is_anima(version)) {
|
||||||
return std::make_shared<WAN::WanVAERunner>(vae_backend,
|
return std::make_shared<WAN::WanVAERunner>(vae_backend,
|
||||||
offload_params_to_cpu,
|
offload_params_to_cpu,
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
@ -936,13 +967,16 @@ public:
|
|||||||
pred_type = EPS_PRED;
|
pred_type = EPS_PRED;
|
||||||
}
|
}
|
||||||
} else if (sd_version_is_sd3(version) ||
|
} else if (sd_version_is_sd3(version) ||
|
||||||
|
sd_version_is_ltxav(version) ||
|
||||||
sd_version_is_wan(version) ||
|
sd_version_is_wan(version) ||
|
||||||
sd_version_is_qwen_image(version) ||
|
sd_version_is_qwen_image(version) ||
|
||||||
sd_version_is_anima(version) ||
|
sd_version_is_anima(version) ||
|
||||||
sd_version_is_ernie_image(version) ||
|
sd_version_is_ernie_image(version) ||
|
||||||
sd_version_is_z_image(version)) {
|
sd_version_is_z_image(version)) {
|
||||||
pred_type = FLOW_PRED;
|
pred_type = FLOW_PRED;
|
||||||
if (sd_version_is_wan(version)) {
|
if (sd_version_is_ltxav(version)) {
|
||||||
|
default_flow_shift = 2.37f;
|
||||||
|
} else if (sd_version_is_wan(version)) {
|
||||||
default_flow_shift = 5.f;
|
default_flow_shift = 5.f;
|
||||||
} else if (sd_version_is_ernie_image(version)) {
|
} else if (sd_version_is_ernie_image(version)) {
|
||||||
default_flow_shift = 4.f;
|
default_flow_shift = 4.f;
|
||||||
@ -979,8 +1013,13 @@ public:
|
|||||||
denoiser = std::make_shared<EDMVDenoiser>();
|
denoiser = std::make_shared<EDMVDenoiser>();
|
||||||
break;
|
break;
|
||||||
case FLOW_PRED: {
|
case FLOW_PRED: {
|
||||||
LOG_INFO("running in FLOW mode");
|
if (sd_version_is_ltxav(version)) {
|
||||||
denoiser = std::make_shared<DiscreteFlowDenoiser>();
|
LOG_INFO("running in LTXAV FLOW mode");
|
||||||
|
denoiser = std::make_shared<FluxFlowDenoiser>();
|
||||||
|
} else {
|
||||||
|
LOG_INFO("running in FLOW mode");
|
||||||
|
denoiser = std::make_shared<DiscreteFlowDenoiser>();
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case FLUX_FLOW_PRED: {
|
case FLUX_FLOW_PRED: {
|
||||||
@ -1621,6 +1660,7 @@ public:
|
|||||||
const sd::Tensor<float>& denoise_mask,
|
const sd::Tensor<float>& denoise_mask,
|
||||||
const sd::Tensor<float>& vace_context,
|
const sd::Tensor<float>& vace_context,
|
||||||
float vace_strength,
|
float vace_strength,
|
||||||
|
int audio_length,
|
||||||
const sd_cache_params_t* cache_params) {
|
const sd_cache_params_t* cache_params) {
|
||||||
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
|
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
|
||||||
float cfg_scale = guidance.txt_cfg;
|
float cfg_scale = guidance.txt_cfg;
|
||||||
@ -1699,6 +1739,7 @@ public:
|
|||||||
diffusion_params.control_strength = control_strength;
|
diffusion_params.control_strength = control_strength;
|
||||||
diffusion_params.vace_context = vace_context.empty() ? nullptr : &vace_context;
|
diffusion_params.vace_context = vace_context.empty() ? nullptr : &vace_context;
|
||||||
diffusion_params.vace_strength = vace_strength;
|
diffusion_params.vace_strength = vace_strength;
|
||||||
|
diffusion_params.audio_length = audio_length;
|
||||||
diffusion_params.skip_layers = nullptr;
|
diffusion_params.skip_layers = nullptr;
|
||||||
|
|
||||||
compute_sample_controls(control_image,
|
compute_sample_controls(control_image,
|
||||||
@ -1860,7 +1901,9 @@ public:
|
|||||||
int get_latent_channel() {
|
int get_latent_channel() {
|
||||||
int latent_channel = 4;
|
int latent_channel = 4;
|
||||||
if (sd_version_is_dit(version)) {
|
if (sd_version_is_dit(version)) {
|
||||||
if (version == VERSION_WAN2_2_TI2V) {
|
if (sd_version_is_ltxav(version)) {
|
||||||
|
latent_channel = 128;
|
||||||
|
} else if (version == VERSION_WAN2_2_TI2V) {
|
||||||
latent_channel = 48;
|
latent_channel = 48;
|
||||||
} else if (version == VERSION_CHROMA_RADIANCE) {
|
} else if (version == VERSION_CHROMA_RADIANCE) {
|
||||||
latent_channel = 3;
|
latent_channel = 3;
|
||||||
@ -1886,7 +1929,9 @@ public:
|
|||||||
int W = width / vae_scale_factor;
|
int W = width / vae_scale_factor;
|
||||||
int H = height / vae_scale_factor;
|
int H = height / vae_scale_factor;
|
||||||
int T = frames;
|
int T = frames;
|
||||||
if (sd_version_is_wan(version)) {
|
if (sd_version_is_ltxav(version)) {
|
||||||
|
T = ((T - 1) / 8) + 1;
|
||||||
|
} else if (sd_version_is_wan(version)) {
|
||||||
T = ((T - 1) / 4) + 1;
|
T = ((T - 1) / 4) + 1;
|
||||||
}
|
}
|
||||||
int C = get_latent_channel();
|
int C = get_latent_channel();
|
||||||
@ -2223,6 +2268,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
"llm_vision_path: %s\n"
|
"llm_vision_path: %s\n"
|
||||||
"diffusion_model_path: %s\n"
|
"diffusion_model_path: %s\n"
|
||||||
"high_noise_diffusion_model_path: %s\n"
|
"high_noise_diffusion_model_path: %s\n"
|
||||||
|
"embeddings_connectors_path: %s\n"
|
||||||
"vae_path: %s\n"
|
"vae_path: %s\n"
|
||||||
"taesd_path: %s\n"
|
"taesd_path: %s\n"
|
||||||
"control_net_path: %s\n"
|
"control_net_path: %s\n"
|
||||||
@ -2255,6 +2301,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
SAFE_STR(sd_ctx_params->llm_vision_path),
|
SAFE_STR(sd_ctx_params->llm_vision_path),
|
||||||
SAFE_STR(sd_ctx_params->diffusion_model_path),
|
SAFE_STR(sd_ctx_params->diffusion_model_path),
|
||||||
SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
|
SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
|
||||||
|
SAFE_STR(sd_ctx_params->embeddings_connectors_path),
|
||||||
SAFE_STR(sd_ctx_params->vae_path),
|
SAFE_STR(sd_ctx_params->vae_path),
|
||||||
SAFE_STR(sd_ctx_params->taesd_path),
|
SAFE_STR(sd_ctx_params->taesd_path),
|
||||||
SAFE_STR(sd_ctx_params->control_net_path),
|
SAFE_STR(sd_ctx_params->control_net_path),
|
||||||
@ -2433,6 +2480,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
|
|||||||
sd_vid_gen_params->strength = 0.75f;
|
sd_vid_gen_params->strength = 0.75f;
|
||||||
sd_vid_gen_params->seed = -1;
|
sd_vid_gen_params->seed = -1;
|
||||||
sd_vid_gen_params->video_frames = 6;
|
sd_vid_gen_params->video_frames = 6;
|
||||||
|
sd_vid_gen_params->fps = 16;
|
||||||
sd_vid_gen_params->moe_boundary = 0.875f;
|
sd_vid_gen_params->moe_boundary = 0.875f;
|
||||||
sd_vid_gen_params->vace_strength = 1.f;
|
sd_vid_gen_params->vace_strength = 1.f;
|
||||||
sd_vid_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
|
sd_vid_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
|
||||||
@ -2444,7 +2492,7 @@ struct sd_ctx_t {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static bool sd_version_supports_video_generation(SDVersion version) {
|
static bool sd_version_supports_video_generation(SDVersion version) {
|
||||||
return version == VERSION_SVD || sd_version_is_wan(version);
|
return version == VERSION_SVD || sd_version_is_wan(version) || sd_version_is_ltxav(version);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool sd_version_supports_image_generation(SDVersion version) {
|
static bool sd_version_supports_image_generation(SDVersion version) {
|
||||||
@ -2589,6 +2637,8 @@ struct GenerationRequest {
|
|||||||
sd_pm_params_t pm_params = {};
|
sd_pm_params_t pm_params = {};
|
||||||
sd_hires_params_t hires = {};
|
sd_hires_params_t hires = {};
|
||||||
int frames = -1;
|
int frames = -1;
|
||||||
|
int requested_frames = -1;
|
||||||
|
int fps = 16;
|
||||||
float vace_strength = 1.f;
|
float vace_strength = 1.f;
|
||||||
|
|
||||||
GenerationRequest(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
|
GenerationRequest(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
|
||||||
@ -2615,12 +2665,18 @@ struct GenerationRequest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
GenerationRequest(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
|
GenerationRequest(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
|
||||||
prompt = SAFE_STR(sd_vid_gen_params->prompt);
|
prompt = SAFE_STR(sd_vid_gen_params->prompt);
|
||||||
negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt);
|
negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt);
|
||||||
width = sd_vid_gen_params->width;
|
width = sd_vid_gen_params->width;
|
||||||
height = sd_vid_gen_params->height;
|
height = sd_vid_gen_params->height;
|
||||||
frames = (sd_vid_gen_params->video_frames - 1) / 4 * 4 + 1;
|
requested_frames = std::max(1, sd_vid_gen_params->video_frames);
|
||||||
|
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||||
|
frames = ((requested_frames - 1 + 7) / 8) * 8 + 1;
|
||||||
|
} else {
|
||||||
|
frames = (requested_frames - 1) / 4 * 4 + 1;
|
||||||
|
}
|
||||||
clip_skip = sd_vid_gen_params->clip_skip;
|
clip_skip = sd_vid_gen_params->clip_skip;
|
||||||
|
fps = std::max(1, sd_vid_gen_params->fps);
|
||||||
vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
|
vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
|
||||||
diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
|
diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
|
||||||
seed = sd_vid_gen_params->seed;
|
seed = sd_vid_gen_params->seed;
|
||||||
@ -2629,6 +2685,12 @@ struct GenerationRequest {
|
|||||||
guidance = sd_vid_gen_params->sample_params.guidance;
|
guidance = sd_vid_gen_params->sample_params.guidance;
|
||||||
high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance;
|
high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance;
|
||||||
resolve(sd_ctx);
|
resolve(sd_ctx);
|
||||||
|
if (frames != requested_frames) {
|
||||||
|
LOG_WARN("align video frames from %d to %d for %s",
|
||||||
|
requested_frames,
|
||||||
|
frames,
|
||||||
|
model_version_to_str[sd_ctx->sd->version]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void align_generation_request_size() {
|
void align_generation_request_size() {
|
||||||
@ -2858,6 +2920,7 @@ struct ImageGenerationLatents {
|
|||||||
sd::Tensor<float> init_latent;
|
sd::Tensor<float> init_latent;
|
||||||
sd::Tensor<float> concat_latent;
|
sd::Tensor<float> concat_latent;
|
||||||
sd::Tensor<float> uncond_concat_latent;
|
sd::Tensor<float> uncond_concat_latent;
|
||||||
|
sd::Tensor<float> audio_latent;
|
||||||
sd::Tensor<float> control_image;
|
sd::Tensor<float> control_image;
|
||||||
std::vector<sd::Tensor<float>> ref_images;
|
std::vector<sd::Tensor<float>> ref_images;
|
||||||
std::vector<sd::Tensor<float>> ref_latents;
|
std::vector<sd::Tensor<float>> ref_latents;
|
||||||
@ -2865,8 +2928,51 @@ struct ImageGenerationLatents {
|
|||||||
sd::Tensor<float> clip_vision_output;
|
sd::Tensor<float> clip_vision_output;
|
||||||
sd::Tensor<float> vace_context;
|
sd::Tensor<float> vace_context;
|
||||||
int64_t ref_image_num = 0;
|
int64_t ref_image_num = 0;
|
||||||
|
int audio_length = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Packs an audio latent behind a video latent in one tensor.
//
// The result has the video latent's shape with dim 3 enlarged by just enough
// extra entries to hold every audio value (rounded up to whole
// width*height*frames slabs). The video values are copied first, the audio
// values directly after them, and any remaining padding stays zero.
// Returns `video_latent` unchanged when `audio_latent` is empty.
//
// Preconditions (asserted): video latent is 4D, or 5D with shape[4] == 1;
// audio latent is 3D, or 4D with shape[3] == 1; the video latent has a
// non-zero width*height*frames extent.
static sd::Tensor<float> pack_ltxav_audio_and_video_latents(const sd::Tensor<float>& video_latent,
                                                            const sd::Tensor<float>& audio_latent) {
    if (audio_latent.empty()) {
        return video_latent;
    }

    GGML_ASSERT(video_latent.dim() == 4 || video_latent.dim() == 5);
    GGML_ASSERT(audio_latent.dim() == 3 || audio_latent.dim() == 4);
    if (video_latent.dim() == 5) {
        GGML_ASSERT(video_latent.shape()[4] == 1);
    }
    if (audio_latent.dim() == 4) {
        GGML_ASSERT(audio_latent.shape()[3] == 1);
    }

    const int64_t width        = video_latent.shape()[0];
    const int64_t height       = video_latent.shape()[1];
    const int64_t frames       = video_latent.shape()[2];
    const int64_t video_ch     = video_latent.shape()[3];
    const int64_t spatial_size = width * height * frames;
    // Guard the division below: a zero-sized video latent cannot carry audio.
    GGML_ASSERT(spatial_size > 0);
    const int64_t audio_values = audio_latent.numel();
    // Ceiling division: number of extra dim-3 slabs needed for the audio data.
    const int64_t extra_ch = (audio_values + spatial_size - 1) / spatial_size;

    std::vector<int64_t> packed_shape = video_latent.shape();
    packed_shape[3]                   = video_ch + extra_ch;
    sd::Tensor<float> packed          = sd::zeros<float>(packed_shape);

    // Flat copies: video values first, audio values immediately after.
    std::copy_n(video_latent.data(), video_latent.numel(), packed.data());
    std::copy_n(audio_latent.data(), audio_latent.numel(), packed.data() + video_latent.numel());
    return packed;
}
|
||||||
|
|
||||||
|
// Number of audio latents spanning `frames` video frames played at `fps`,
// rounded up. Both arguments must be positive (asserted).
static int get_ltxav_num_audio_latents(int frames, int fps) {
    GGML_ASSERT(frames > 0);
    GGML_ASSERT(fps > 0);

    // Audio latent rate = sample rate / mel hop length / latent downsample.
    constexpr float kSampleRate            = 16000.0f;
    constexpr float kMelHopLength          = 160.0f;
    constexpr float kAudioLatentDownsample = 4.0f;
    constexpr float kLatentsPerSecond      = kSampleRate / kMelHopLength / kAudioLatentDownsample;

    const float duration_seconds = static_cast<float>(frames) / static_cast<float>(fps);
    const float latent_count     = duration_seconds * kLatentsPerSecond;
    return static_cast<int>(std::ceil(latent_count));
}
|
||||||
|
|
||||||
struct ImageGenerationEmbeds {
|
struct ImageGenerationEmbeds {
|
||||||
SDCondition cond;
|
SDCondition cond;
|
||||||
SDCondition uncond;
|
SDCondition uncond;
|
||||||
@ -3454,6 +3560,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
|||||||
latents.denoise_mask,
|
latents.denoise_mask,
|
||||||
sd::Tensor<float>(),
|
sd::Tensor<float>(),
|
||||||
1.f,
|
1.f,
|
||||||
|
0,
|
||||||
request.cache_params);
|
request.cache_params);
|
||||||
int64_t sampling_end = ggml_time_ms();
|
int64_t sampling_end = ggml_time_ms();
|
||||||
if (!x_0.empty()) {
|
if (!x_0.empty()) {
|
||||||
@ -3575,6 +3682,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
|||||||
hires_denoise_mask,
|
hires_denoise_mask,
|
||||||
sd::Tensor<float>(),
|
sd::Tensor<float>(),
|
||||||
1.f,
|
1.f,
|
||||||
|
0,
|
||||||
request.cache_params);
|
request.cache_params);
|
||||||
int64_t hires_sample_end = ggml_time_ms();
|
int64_t hires_sample_end = ggml_time_ms();
|
||||||
if (!x_0.empty()) {
|
if (!x_0.empty()) {
|
||||||
@ -3633,6 +3741,18 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
|||||||
end_image = sd_image_to_tensor(sd_vid_gen_params->end_image, request->width, request->height);
|
end_image = sd_image_to_tensor(sd_vid_gen_params->end_image, request->width, request->height);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||||
|
latents.audio_length = 0;
|
||||||
|
latents.audio_latent = {};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||||
|
if (!start_image.empty() || !end_image.empty() || sd_vid_gen_params->control_frames_size > 0) {
|
||||||
|
LOG_ERROR("LTXAV currently supports txt2vid only; init_image, end_image, and control_frames are not implemented");
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
|
if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
|
||||||
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" ||
|
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" ||
|
||||||
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" ||
|
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" ||
|
||||||
@ -3803,6 +3923,9 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
|||||||
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
|
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pipeline-level audio support is temporarily disabled. Keep the model-side
|
||||||
|
// AV implementation intact, but feed pure video latents through vid_gen.
|
||||||
|
|
||||||
return latents;
|
return latents;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3839,14 +3962,26 @@ static ImageGenerationEmbeds prepare_video_generation_embeds(sd_ctx_t* sd_ctx,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
|
static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
|
||||||
|
const GenerationRequest& request,
|
||||||
const sd::Tensor<float>& final_latent,
|
const sd::Tensor<float>& final_latent,
|
||||||
int* num_frames_out) {
|
int* num_frames_out) {
|
||||||
if (final_latent.empty()) {
|
if (final_latent.empty()) {
|
||||||
LOG_ERROR("no latent video to decode");
|
LOG_ERROR("no latent video to decode");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
sd::Tensor<float> video_latent = final_latent;
|
||||||
|
if (sd_version_is_ltxav(sd_ctx->sd->version) &&
|
||||||
|
video_latent.shape()[3] > sd_ctx->sd->get_latent_channel()) {
|
||||||
|
video_latent = sd::ops::slice(video_latent, 3, 0, sd_ctx->sd->get_latent_channel());
|
||||||
|
}
|
||||||
|
LOG_DEBUG("decode_video_outputs latent %dx%dx%dx%d",
|
||||||
|
(int)video_latent.shape()[0],
|
||||||
|
(int)video_latent.shape()[1],
|
||||||
|
(int)video_latent.shape()[2],
|
||||||
|
(int)video_latent.shape()[3]);
|
||||||
|
// auto z = sd::load_tensor_from_file_as_tensor<float>("ltx_vae_z.bin");
|
||||||
int64_t t4 = ggml_time_ms();
|
int64_t t4 = ggml_time_ms();
|
||||||
sd::Tensor<float> vid = sd_ctx->sd->decode_first_stage(final_latent, true);
|
sd::Tensor<float> vid = sd_ctx->sd->decode_first_stage(video_latent, true);
|
||||||
int64_t t5 = ggml_time_ms();
|
int64_t t5 = ggml_time_ms();
|
||||||
LOG_INFO("decode_first_stage completed, taking %.2fs", (t5 - t4) * 1.0f / 1000);
|
LOG_INFO("decode_first_stage completed, taking %.2fs", (t5 - t4) * 1.0f / 1000);
|
||||||
if (sd_ctx->sd->free_params_immediately) {
|
if (sd_ctx->sd->free_params_immediately) {
|
||||||
@ -3856,6 +3991,15 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
|
|||||||
LOG_ERROR("decode_first_stage failed for video");
|
LOG_ERROR("decode_first_stage failed for video");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
LOG_DEBUG("decode_video_outputs decoded %dx%dx%dx%d",
|
||||||
|
(int)vid.shape()[0],
|
||||||
|
(int)vid.shape()[1],
|
||||||
|
(int)vid.shape()[2],
|
||||||
|
(int)vid.shape()[3]);
|
||||||
|
if (request.requested_frames > 0 &&
|
||||||
|
vid.shape()[2] > request.requested_frames) {
|
||||||
|
vid = sd::ops::slice(vid, 2, 0, request.requested_frames);
|
||||||
|
}
|
||||||
|
|
||||||
sd_image_t* result_images = (sd_image_t*)calloc(vid.shape()[2], sizeof(sd_image_t));
|
sd_image_t* result_images = (sd_image_t*)calloc(vid.shape()[2], sizeof(sd_image_t));
|
||||||
if (result_images == nullptr) {
|
if (result_images == nullptr) {
|
||||||
@ -3939,6 +4083,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
latents.denoise_mask,
|
latents.denoise_mask,
|
||||||
latents.vace_context,
|
latents.vace_context,
|
||||||
request.vace_strength,
|
request.vace_strength,
|
||||||
|
latents.audio_length,
|
||||||
request.cache_params);
|
request.cache_params);
|
||||||
int64_t sampling_end = ggml_time_ms();
|
int64_t sampling_end = ggml_time_ms();
|
||||||
if (x_t_sampled.empty()) {
|
if (x_t_sampled.empty()) {
|
||||||
@ -3981,6 +4126,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
latents.denoise_mask,
|
latents.denoise_mask,
|
||||||
latents.vace_context,
|
latents.vace_context,
|
||||||
request.vace_strength,
|
request.vace_strength,
|
||||||
|
latents.audio_length,
|
||||||
request.cache_params);
|
request.cache_params);
|
||||||
|
|
||||||
int64_t sampling_end = ggml_time_ms();
|
int64_t sampling_end = ggml_time_ms();
|
||||||
@ -4000,7 +4146,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
int64_t latent_end = ggml_time_ms();
|
int64_t latent_end = ggml_time_ms();
|
||||||
LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000);
|
LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000);
|
||||||
|
|
||||||
auto result = decode_video_outputs(sd_ctx, final_latent, num_frames_out);
|
auto result = decode_video_outputs(sd_ctx, request, final_latent, num_frames_out);
|
||||||
if (result == nullptr) {
|
if (result == nullptr) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,7 +2,6 @@
|
|||||||
#define __TAE_HPP__
|
#define __TAE_HPP__
|
||||||
|
|
||||||
#include "ggml_extend.hpp"
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
#include "model.h"
|
#include "model.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|||||||
@ -104,7 +104,7 @@ namespace sd {
|
|||||||
throw std::invalid_argument("tensor file type does not match requested sd::Tensor type");
|
throw std::invalid_argument("tensor file type does not match requested sd::Tensor type");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int64_t> shape(4, 1);
|
std::vector<int64_t> shape(n_dims, 1);
|
||||||
for (int i = 0; i < n_dims; ++i) {
|
for (int i = 0; i < n_dims; ++i) {
|
||||||
int32_t dim = 1;
|
int32_t dim = 1;
|
||||||
file.read(reinterpret_cast<char*>(&dim), sizeof(dim));
|
file.read(reinterpret_cast<char*>(&dim), sizeof(dim));
|
||||||
|
|||||||
@ -50,6 +50,7 @@ GemmaTokenizer::GemmaTokenizer(const std::string& merges_utf8_str, const std::st
|
|||||||
byte_level_bpe = false;
|
byte_level_bpe = false;
|
||||||
byte_fallback = true;
|
byte_fallback = true;
|
||||||
add_bos_token = true;
|
add_bos_token = true;
|
||||||
|
pad_left = true;
|
||||||
PAD_TOKEN = "<pad>";
|
PAD_TOKEN = "<pad>";
|
||||||
EOS_TOKEN = "<eos>";
|
EOS_TOKEN = "<eos>";
|
||||||
BOS_TOKEN = "<bos>";
|
BOS_TOKEN = "<bos>";
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@ -67,7 +67,9 @@ public:
|
|||||||
|
|
||||||
int get_scale_factor() {
|
int get_scale_factor() {
|
||||||
int scale_factor = 8;
|
int scale_factor = 8;
|
||||||
if (version == VERSION_WAN2_2_TI2V) {
|
if (version == VERSION_LTXAV) {
|
||||||
|
scale_factor = 32;
|
||||||
|
} else if (version == VERSION_WAN2_2_TI2V) {
|
||||||
scale_factor = 16;
|
scale_factor = 16;
|
||||||
} else if (sd_version_uses_flux2_vae(version)) {
|
} else if (sd_version_uses_flux2_vae(version)) {
|
||||||
scale_factor = 16;
|
scale_factor = 16;
|
||||||
|
|||||||
16
src/wan.hpp
16
src/wan.hpp
@ -966,10 +966,10 @@ namespace WAN {
|
|||||||
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new CausalConv3d(z_dim, z_dim, {1, 1, 1}));
|
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new CausalConv3d(z_dim, z_dim, {1, 1, 1}));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* patchify(ggml_context* ctx,
|
static ggml_tensor* patchify(ggml_context* ctx,
|
||||||
ggml_tensor* x,
|
ggml_tensor* x,
|
||||||
int64_t patch_size,
|
int64_t patch_size,
|
||||||
int64_t b = 1) {
|
int64_t b = 1) {
|
||||||
// x: [b*c, f, h*q, w*r]
|
// x: [b*c, f, h*q, w*r]
|
||||||
// return: [b*c*r*q, f, h, w]
|
// return: [b*c*r*q, f, h, w]
|
||||||
if (patch_size == 1) {
|
if (patch_size == 1) {
|
||||||
@ -993,10 +993,10 @@ namespace WAN {
|
|||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* unpatchify(ggml_context* ctx,
|
static ggml_tensor* unpatchify(ggml_context* ctx,
|
||||||
ggml_tensor* x,
|
ggml_tensor* x,
|
||||||
int64_t patch_size,
|
int64_t patch_size,
|
||||||
int64_t b = 1) {
|
int64_t b = 1) {
|
||||||
// x: [b*c*r*q, f, h, w]
|
// x: [b*c*r*q, f, h, w]
|
||||||
// return: [b*c, f, h*q, w*r]
|
// return: [b*c, f, h*q, w*r]
|
||||||
if (patch_size == 1) {
|
if (patch_size == 1) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user