Compare commits


1 Commit

Author  SHA1        Message  Date
leejet  ca7e008d78  wip      2026-04-27 21:43:22 +08:00
10 changed files with 326 additions and 702 deletions

View File

@@ -19,6 +19,8 @@
#include "common/media_io.h"
#include "common/resource_owners.hpp"
#include "image_metadata.h"
#include "llm.hpp"
#include "ltx_vae_test.h"
namespace fs = std::filesystem;
@@ -500,6 +502,27 @@ int main(int argc, const char* argv[]) {
SDContextParams ctx_params;
SDGenerationParams gen_params;
cli_params.verbose = true;
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
{
const bool run_ltx_vae_test = false;
const std::string model_path = "E:/Code/ComfyUI/models/vae/ltx-2.3-22b-dev_video_vae.safetensors";
const std::string input_path = "E:/Code/sd.cpp/build/ltx_vae_z.bin";
if (run_ltx_vae_test) {
ltx_vae_load_from_file_and_test(model_path, input_path);
return 0;
}
}
// cli_params.verbose = true;
// sd_set_log_callback(sd_log_cb, (void*)&cli_params);
// GemmaTokenizer tokenizer;
// auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");  // "a cute little kitten"
// for (auto token : tokens) {
// LOG_INFO("%d", token);
// }
// return 0;
parse_args(argc, argv, cli_params, ctx_params, gen_params);
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
log_verbose = cli_params.verbose;
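Note: the test hook above is compile-time gated (run_ltx_vae_test is a const false) and points at hard-coded local Windows paths, so it is inert in normal builds. A less machine-specific gate could read the paths from the environment; this is a sketch only, and the variable names SD_LTX_VAE_TEST_MODEL / SD_LTX_VAE_TEST_INPUT are made up for illustration:

// Sketch (hypothetical): requires <cstdlib> for std::getenv.
if (const char* model = std::getenv("SD_LTX_VAE_TEST_MODEL")) {
    const char* input = std::getenv("SD_LTX_VAE_TEST_INPUT");
    ltx_vae_load_from_file_and_test(model, input != nullptr ? input : "");
    return 0;
}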

View File

@@ -103,64 +103,6 @@ namespace DiT {
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
return x;
}
inline ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int pt,
int ph,
int pw,
int64_t N = 1) {
// x: [N*C, T, H, W]
// return: [N, h*w, C*pt*ph*pw]
int64_t C = x->ne[3] / N;
int64_t T = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t t_len = T / pt;
int64_t h_len = H / ph;
int64_t w_len = W / pw;
GGML_ASSERT(C * N == x->ne[3]);
GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N); // [N*C*t_len, pt, h_len*ph, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, h_len*ph, pt, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N); // [N*C*t_len*h_len, ph, pt, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt, ph, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt*ph, w_len, pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, w_len, pt*ph, pw]
x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N); // [N, C, t_len*h_len*w_len, pt*ph*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N, t_len*h_len*w_len, C, pt*ph*pw]
x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1); // [N, t_len*h_len*w_len, C*pt*ph*pw]
return x;
}
inline ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t t_len,
int64_t h_len,
int64_t w_len,
int pt,
int ph,
int pw) {
// x: [N, t_len*h_len*w_len, pt*ph*pw*C]
// return: [N*C, t_len*pt, h_len*ph, w_len*pw]
int64_t N = x->ne[3];
int64_t C = x->ne[0] / pt / ph / pw;
GGML_ASSERT(C * pt * ph * pw == x->ne[0]);
x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N); // [N, t_len*h_len*w_len, pt*ph*pw, C]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3)); // [N, C, t_len*h_len*w_len, pt*ph*pw]
x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N); // [N*C*t_len*h_len, w_len, pt*ph, pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt*ph, w_len, pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt, ph, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, ph, pt, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N); // [N*C*t_len, h_len*ph, pt, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, pt, h_len*ph, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N); // [N*C, t_len*pt, h_len*ph, w_len*pw]
return x;
}
} // namespace DiT
#endif // __COMMON_DIT_HPP__
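For reference, the patchify/unpatchify pair deleted above were exact inverses of each other; a minimal round-trip under the shape conventions in their comments (the concrete sizes here are illustrative only):

// x: ne = [W, H, T, N*C] = [16, 16, 8, 128], i.e. N = 1, C = 128
ggml_tensor* tok = DiT::patchify(ctx, x, /*pt=*/1, /*ph=*/2, /*pw=*/2);
// tok: [N, t_len*h_len*w_len, C*pt*ph*pw] = [1, 8*8*8, 128*1*2*2] = [1, 512, 512]
ggml_tensor* y = DiT::unpatchify(ctx, tok, /*t_len=*/8, /*h_len=*/8, /*w_len=*/8, 1, 2, 2);
// y: [N*C, t_len*pt, h_len*ph, w_len*pw] = [128, 8, 16, 16], matching x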

View File

@@ -1682,15 +1682,6 @@ struct GGMLRunnerContext {
bool circular_x_enabled = false;
bool circular_y_enabled = false;
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
void capture_tensor(const std::string& name, ggml_tensor* tensor) {
if (debug_tensors == nullptr || tensor == nullptr) {
return;
}
ggml_set_output(tensor);
(*debug_tensors)[tensor] = name;
}
};
struct GGMLRunner {
@@ -1722,7 +1713,6 @@ protected:
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
std::unordered_map<ggml_tensor*, std::string> debug_tensors;
const std::string final_result_name = "ggml_runner_final_result_tensor";
bool flash_attn_enabled = false;
@@ -1809,7 +1799,6 @@ protected:
}
void free_compute_ctx() {
debug_tensors.clear();
if (compute_ctx != nullptr) {
ggml_free(compute_ctx);
compute_ctx = nullptr;
@@ -1845,11 +1834,6 @@ protected:
auto result = ggml_graph_node(gf, -1);
ggml_set_name(result, final_result_name.c_str());
}
for (const auto& entry : debug_tensors) {
if (entry.first != nullptr) {
ggml_build_forward_expand(gf, entry.first);
}
}
prepare_build_in_tensor_after(gf);
return gf;
}
@@ -1919,21 +1903,6 @@ protected:
for (auto& kv : backend_tensor_data_map) {
auto tensor = kv.first;
auto data = kv.second;
if (tensor == nullptr || data == nullptr) {
continue;
}
const char* name = ggml_get_name(tensor);
if (tensor->buffer == nullptr) {
LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
get_desc().c_str(),
name != nullptr ? name : "",
(long long)tensor->ne[0],
(long long)tensor->ne[1],
(long long)tensor->ne[2],
(long long)tensor->ne[3],
ggml_type_name(tensor->type));
continue;
}
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
}
@@ -2056,7 +2025,6 @@ public:
runner_ctx.circular_x_enabled = circular_x_enabled;
runner_ctx.circular_y_enabled = circular_y_enabled;
runner_ctx.weight_adapter = weight_adapter;
runner_ctx.debug_tensors = &debug_tensors;
return runner_ctx;
}
@@ -2195,21 +2163,6 @@ public:
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
return std::nullopt;
}
for (const auto& entry : debug_tensors) {
auto tensor = entry.first;
if (tensor == nullptr) {
continue;
}
if (tensor->type != GGML_TYPE_F32) {
LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
get_desc().c_str(),
entry.second.c_str(),
ggml_type_name(tensor->type));
continue;
}
auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
print_sd_tensor(debug_tensor, false, entry.second.c_str());
}
copy_cache_tensors_to_cache_buffer();
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
std::optional<sd::Tensor<T>> output;
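The hunks above strip the runner's debug-tensor plumbing. For context, this is roughly how the deleted hook was used (all names are taken from the removed lines):

// Inside a block's forward pass, with a GGMLRunnerContext* rctx (removed API):
//   rctx->capture_tensor("blocks.0.attn_out", attn_out);  // ggml_set_output + record the name
// The runner then expanded every captured tensor into the compute graph, and
// after a successful compute each captured GGML_TYPE_F32 tensor was dumped
// with print_sd_tensor(...) under its recorded name.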

View File

@@ -1,5 +1,5 @@
#ifndef __SD_LTX_VAE_HPP__
#define __SD_LTX_VAE_HPP__
#ifndef __SD_LTX_VAE_H__
#define __SD_LTX_VAE_H__
#include <fstream>
#include <memory>
@@ -936,8 +936,7 @@ struct LTXVideoVAE : public VAE {
static void load_from_file_and_test(const std::string& model_path,
const std::string& input_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_cuda_init(0);
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
ModelLoader model_loader;
@@ -968,4 +967,4 @@ struct LTXVideoVAE : public VAE {
}
};
#endif // __SD_LTX_VAE_HPP__
#endif // __SD_LTX_VAE_H__
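Note that the test helper now initializes the CUDA backend unconditionally, which cannot work on CPU-only builds. A conventional guard would look like the sketch below; the build flag name SD_USE_CUDA is an assumption for illustration:

ggml_backend_t backend = nullptr;
#ifdef SD_USE_CUDA                        // hypothetical flag; use the project's actual CUDA guard
backend = ggml_backend_cuda_init(0);      // device 0
#endif
if (backend == nullptr) {
    backend = ggml_backend_cpu_init();    // portable fallback
}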

src/ltx_vae_test.cpp (new file, 8 lines added)
View File

@@ -0,0 +1,8 @@
#include "ltx_vae_test.h"
#include "ltx_vae.h"
void ltx_vae_load_from_file_and_test(const std::string& model_path,
const std::string& input_path) {
LTXVideoVAE::load_from_file_and_test(model_path, input_path);
}

src/ltx_vae_test.h (new file, 9 lines added)
View File

@@ -0,0 +1,9 @@
#ifndef __SD_LTX_VAE_TEST_H__
#define __SD_LTX_VAE_TEST_H__
#include <string>
void ltx_vae_load_from_file_and_test(const std::string& model_path,
const std::string& input_path);
#endif // __SD_LTX_VAE_TEST_H__
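This small shim appears to exist so the CLI can trigger the VAE test without pulling ltx_vae.h (and the ggml-heavy headers behind it) into its own translation unit; main.cpp sees only this one free function and, as in the first hunk above, calls:

ltx_vae_load_from_file_and_test(model_path, input_path);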

View File

@@ -1,5 +1,5 @@
#ifndef __SD_LTXV_HPP__
#define __SD_LTXV_HPP__
#ifndef __LTXV_HPP__
#define __LTXV_HPP__
#include <algorithm>
#include <cmath>
@@ -79,30 +79,6 @@ namespace LTXV {
return out;
}
__STATIC_INLINE__ std::vector<double> generate_freq_grid_double(double theta,
int positional_dims,
int dim) {
const int n_elem = 2 * positional_dims;
const int freq_count = dim / n_elem;
std::vector<double> out(freq_count);
if (freq_count <= 0) {
return out;
}
if (freq_count == 1) {
out[0] = 1.5707963267948966;
return out;
}
const double half_pi = 1.5707963267948966;
const double log_theta = std::log(theta);
for (int i = 0; i < freq_count; i++) {
double ratio = static_cast<double>(i) / static_cast<double>(freq_count - 1);
out[i] = std::exp(log_theta * ratio) * half_pi;
}
return out;
}
__STATIC_INLINE__ std::vector<float> build_rope_matrix_from_frequencies(
const std::vector<std::vector<float>>& frequencies,
int dim) {
@@ -126,43 +102,16 @@
return out;
}
__STATIC_INLINE__ std::vector<std::vector<float>> split_frequencies_by_heads(
const std::vector<std::vector<float>>& frequencies,
int inner_dim,
int num_heads) {
GGML_ASSERT(num_heads > 0);
GGML_ASSERT(inner_dim % num_heads == 0);
const int inner_half_dim = inner_dim / 2;
const int per_head_half_dim = inner_half_dim / num_heads;
GGML_ASSERT(inner_half_dim % num_heads == 0);
std::vector<std::vector<float>> out(
frequencies.size() * static_cast<size_t>(num_heads),
std::vector<float>(per_head_half_dim, 0.f));
for (size_t token = 0; token < frequencies.size(); token++) {
GGML_ASSERT(static_cast<int>(frequencies[token].size()) == inner_half_dim);
for (int head = 0; head < num_heads; head++) {
auto& dst = out[token * static_cast<size_t>(num_heads) + static_cast<size_t>(head)];
std::copy_n(frequencies[token].begin() + head * per_head_half_dim, per_head_half_dim, dst.begin());
}
}
return out;
}
__STATIC_INLINE__ std::vector<float> build_video_rope_matrix(int64_t width,
int64_t height,
int64_t frames,
int dim,
int num_heads = 1,
float frame_rate = 25.f,
float theta = 10000.f,
const std::vector<int>& max_pos = {20, 2048, 2048},
const std::tuple<int, int, int>& vae_scale_factors = {8, 32, 32},
bool causal_temporal_positioning = false,
bool use_middle_indices_grid = false) {
bool causal_temporal_positioning = false) {
GGML_ASSERT(max_pos.size() == 3);
GGML_ASSERT(dim % num_heads == 0);
const std::vector<float> indices = generate_freq_grid(theta, 3, dim);
const int half_dim = dim / 2;
const int pad_size = half_dim - static_cast<int>(indices.size()) * 3;
@@ -180,25 +129,11 @@ namespace LTXV {
pixel_t = std::max(0.f, pixel_t + 1.f - scale_t);
}
pixel_t /= frame_rate;
if (use_middle_indices_grid) {
float end = static_cast<float>((t + 1) * scale_t);
if (causal_temporal_positioning) {
end = std::max(0.f, end + 1.f - scale_t);
}
end /= frame_rate;
pixel_t = 0.5f * (pixel_t + end);
}
for (int64_t h = 0; h < height; h++) {
float pixel_h = static_cast<float>(h * scale_h);
if (use_middle_indices_grid) {
pixel_h += 0.5f * static_cast<float>(scale_h);
}
for (int64_t w = 0; w < width; w++) {
float pixel_w = static_cast<float>(w * scale_w);
if (use_middle_indices_grid) {
pixel_w += 0.5f * static_cast<float>(scale_w);
}
int out_idx = 0;
for (int i = 0; i < pad_size; i++) {
@@ -211,6 +146,13 @@ namespace LTXV {
pixel_w / max_pos[2],
};
// Match ComfyUI generate_freqs():
// (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
// .transpose(-1, -2)
// .flatten(2)
// After the transpose, the half-dim order is:
// [t_f0, h_f0, w_f0, t_f1, h_f1, w_f1, ...]
// not [t_f0, t_f1, ..., h_f0, h_f1, ..., w_f0, w_f1, ...].
for (float index : indices) {
for (int axis = 0; axis < 3; axis++) {
freqs[token][out_idx++] = index * (coords[axis] * 2.f - 1.f);
@@ -221,24 +163,16 @@
}
}
if (num_heads > 1) {
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
}
return build_rope_matrix_from_frequencies(freqs, dim);
}
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix(int64_t seq_len,
int dim,
int num_heads = 1,
float theta = 10000.f,
float positional_scale = 4096.f,
bool double_precision = false) {
GGML_ASSERT(dim % num_heads == 0);
const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
const std::vector<double> indices_d =
double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
float positional_scale = 4096.f) {
const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
const int half_dim = dim / 2;
const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
const int pad_size = half_dim - static_cast<int>(indices.size());
std::vector<std::vector<float>> freqs(static_cast<size_t>(seq_len), std::vector<float>(half_dim, 0.f));
for (int64_t pos = 0; pos < seq_len; pos++) {
@@ -247,39 +181,20 @@ namespace LTXV {
freqs[static_cast<size_t>(pos)][out_idx++] = 0.f;
}
if (double_precision) {
double coord = static_cast<double>(pos) / static_cast<double>(positional_scale);
for (double index : indices_d) {
freqs[static_cast<size_t>(pos)][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
}
} else {
float coord = static_cast<float>(pos) / positional_scale;
for (float index : indices) {
freqs[static_cast<size_t>(pos)][out_idx++] = index * (coord * 2.f - 1.f);
}
}
}
if (num_heads > 1) {
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
}
return build_rope_matrix_from_frequencies(freqs, dim);
}
__STATIC_INLINE__ ggml_tensor* apply_hidden_rope(ggml_context* ctx,
ggml_tensor* x,
ggml_tensor* pe,
int64_t heads,
int64_t dim_head,
bool rope_interleaved) {
GGML_ASSERT(x->ne[0] == heads * dim_head);
auto x4 = ggml_reshape_4d(ctx, x, dim_head, heads, x->ne[1], x->ne[2]);
if (pe != nullptr && pe->ne[3] == x->ne[1] * heads) {
auto x_flat = ggml_reshape_4d(ctx, x4, dim_head, 1, x->ne[1] * heads, x->ne[2]);
auto out_flat = Rope::apply_rope(ctx, x_flat, pe, rope_interleaved);
auto out4 = ggml_reshape_4d(ctx, out_flat, dim_head, heads, x->ne[1], x->ne[2]);
return ggml_reshape_3d(ctx, out4, heads * dim_head, x->ne[1], x->ne[2]);
}
auto x4 = ggml_reshape_4d(ctx, x, x->ne[0], 1, x->ne[1], x->ne[2]);
return Rope::apply_rope(ctx, x4, pe, rope_interleaved);
}
@@ -423,8 +338,8 @@ namespace LTXV {
if (k_pe == nullptr) {
k_pe = pe;
}
q = apply_hidden_rope(ctx->ggml_ctx, q, pe, heads, dim_head, rope_interleaved);
k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, heads, dim_head, rope_interleaved);
q = apply_hidden_rope(ctx->ggml_ctx, q, pe, rope_interleaved);
k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, rope_interleaved);
}
auto out = ggml_ext_attention_ext(ctx->ggml_ctx,
@@ -500,7 +415,7 @@ namespace LTXV {
s = ggml_repeat(ctx->ggml_ctx, s, e);
t = ggml_repeat(ctx->ggml_ctx, t, e);
auto out = ggml_add(ctx->ggml_ctx, s, t);
return ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
return ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
}
std::vector<ggml_tensor*> get_prompt_scale_shift_values(GGMLRunnerContext* ctx,
@@ -694,7 +609,7 @@ namespace LTXV {
float positional_embedding_theta = 10000.f;
std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
std::tuple<int, int, int> vae_scale_factors = {8, 32, 32};
bool causal_temporal_positioning = true;
bool causal_temporal_positioning = false;
float timestep_scale_multiplier = 1000.f;
int64_t audio_in_channels = 128;
@@ -726,14 +641,11 @@ namespace LTXV {
bool audio_connector_rope_interleaved = false;
bool audio_connector_apply_gated_attention = false;
bool video_rope_interleaved = false;
bool use_middle_indices_grid = true;
bool video_rope_interleaved = true;
bool cross_attention_adaln = false;
bool use_caption_projection = true;
bool use_audio_caption_projection = true;
bool caption_proj_before_connector = true;
bool caption_projection_first_linear = false;
bool self_attention_gated = false;
bool cross_attention_gated = false;
@@ -758,16 +670,11 @@ namespace LTXV {
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix_from_coords(const std::vector<float>& coords,
int dim,
int num_heads = 1,
float theta = 10000.f,
float max_pos = 20.f,
bool double_precision = false) {
GGML_ASSERT(dim % num_heads == 0);
const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
const std::vector<double> indices_d =
double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
float max_pos = 20.f) {
const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
const int half_dim = dim / 2;
const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
const int pad_size = half_dim - static_cast<int>(indices.size());
std::vector<std::vector<float>> freqs(coords.size(), std::vector<float>(half_dim, 0.f));
for (size_t pos = 0; pos < coords.size(); pos++) {
@@ -775,21 +682,11 @@ namespace LTXV {
for (int i = 0; i < pad_size; i++) {
freqs[pos][out_idx++] = 0.f;
}
if (double_precision) {
double coord = static_cast<double>(coords[pos]) / static_cast<double>(max_pos);
for (double index : indices_d) {
freqs[pos][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
}
} else {
float coord = coords[pos] / max_pos;
for (float index : indices) {
freqs[pos][out_idx++] = index * (coord * 2.f - 1.f);
}
}
}
if (num_heads > 1) {
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
}
return build_rope_matrix_from_frequencies(freqs, dim);
}
@@ -808,7 +705,6 @@ namespace LTXV {
int64_t height,
int64_t frames,
int dim,
int num_heads,
float frame_rate,
float theta,
int max_pos_t,
@@ -829,7 +725,7 @@ namespace LTXV {
}
}
}
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
}
__STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
@@ -846,7 +742,6 @@ namespace LTXV {
__STATIC_INLINE__ std::vector<float> build_audio_rope_matrix(int64_t seq_len,
int dim,
int num_heads,
float theta = 10000.f,
int max_pos_t = 20,
bool use_middle_indices_grid = false) {
@@ -860,7 +755,7 @@ namespace LTXV {
coords[static_cast<size_t>(t)] = start;
}
}
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
}
struct BasicAVTransformerBlock : public GGMLBlock {
@@ -930,7 +825,7 @@ namespace LTXV {
t = ggml_repeat(ctx->ggml_ctx, t, e);
s = ggml_repeat(ctx->ggml_ctx, s, e);
auto out = ggml_add(ctx->ggml_ctx, s, t);
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
return std::vector<ggml_tensor*>(chunks.begin() + start, chunks.begin() + start + count);
}
@@ -1109,23 +1004,11 @@ namespace LTXV {
blocks["av_ca_v2a_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 1);
if (cfg.use_caption_projection) {
if (cfg.caption_proj_before_connector) {
if (cfg.caption_projection_first_linear) {
blocks["caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.hidden_size);
}
} else {
blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.hidden_size, cfg.hidden_size);
}
}
if (cfg.use_audio_caption_projection) {
if (cfg.caption_proj_before_connector) {
if (cfg.caption_projection_first_linear) {
blocks["audio_caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.audio_hidden_size);
}
} else {
blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.audio_hidden_size, cfg.audio_hidden_size);
}
}
if (cfg.use_connector) {
blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.connector_hidden_size,
@@ -1197,97 +1080,42 @@ namespace LTXV {
std::pair<ggml_tensor*, ggml_tensor*> preprocess_contexts(GGMLRunnerContext* ctx,
ggml_tensor* context,
ggml_tensor* video_connector_pe,
ggml_tensor* audio_connector_pe,
bool process_audio_context) {
ggml_tensor* audio_connector_pe) {
if (context == nullptr) {
return {nullptr, nullptr};
}
bool is_fully_processed_context =
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
context->ne[1] >= 1024;
bool is_unprocessed_dual_context =
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
context->ne[1] < 1024;
if (is_fully_processed_context) {
auto v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
ggml_tensor* a_context = nullptr;
if (process_audio_context) {
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
}
return {v_context, a_context};
if (context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim) {
return {
ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim),
ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim)
};
}
ggml_tensor* v_context = context;
ggml_tensor* a_context = process_audio_context ? context : nullptr;
if (is_unprocessed_dual_context) {
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
if (process_audio_context) {
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
}
} else if (context->ne[0] == cfg.caption_channels * 2) {
ggml_tensor* a_context = context;
if (context->ne[0] == cfg.caption_channels * 2) {
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.caption_channels);
if (process_audio_context) {
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.caption_channels, cfg.caption_channels * 2);
}
}
if (cfg.caption_proj_before_connector) {
if (cfg.use_caption_projection &&
blocks.count("caption_projection") > 0 &&
v_context != nullptr &&
v_context->ne[0] == cfg.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["caption_projection"]);
if (caption_projection != nullptr) {
v_context = caption_projection->forward(ctx, v_context);
}
}
if (process_audio_context &&
cfg.use_audio_caption_projection &&
blocks.count("audio_caption_projection") > 0 &&
a_context != nullptr &&
a_context->ne[0] == cfg.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["audio_caption_projection"]);
if (caption_projection != nullptr) {
a_context = caption_projection->forward(ctx, a_context);
}
}
}
if (cfg.use_connector && v_context != nullptr && v_context->ne[0] == cfg.connector_hidden_size) {
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["video_embeddings_connector"]);
v_context = connector->forward(ctx, v_context, video_connector_pe);
}
if (process_audio_context &&
cfg.use_audio_connector &&
a_context != nullptr &&
a_context->ne[0] == cfg.audio_connector_hidden_size) {
if (cfg.use_audio_connector && a_context != nullptr && a_context->ne[0] == cfg.audio_connector_hidden_size) {
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["audio_embeddings_connector"]);
a_context = connector->forward(ctx, a_context, audio_connector_pe);
}
if (!cfg.caption_proj_before_connector &&
cfg.use_caption_projection &&
blocks.count("caption_projection") > 0 &&
v_context != nullptr &&
v_context->ne[0] == cfg.caption_channels) {
if (cfg.use_caption_projection && v_context != nullptr && v_context->ne[0] == cfg.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["caption_projection"]);
if (caption_projection != nullptr) {
v_context = caption_projection->forward(ctx, v_context);
}
}
if (process_audio_context &&
!cfg.caption_proj_before_connector &&
cfg.use_audio_caption_projection &&
blocks.count("audio_caption_projection") > 0 &&
a_context != nullptr &&
a_context->ne[0] == cfg.caption_channels) {
if (cfg.use_audio_caption_projection && a_context != nullptr && a_context->ne[0] == cfg.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["audio_caption_projection"]);
if (caption_projection != nullptr) {
a_context = caption_projection->forward(ctx, a_context);
}
}
return {v_context, a_context};
}
@@ -1340,13 +1168,9 @@ namespace LTXV {
ax = nullptr;
}
bool run_ax = ax != nullptr && ggml_nelements(ax) > 0 && audio_time > 0;
auto contexts = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe, run_ax);
auto contexts = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe);
auto v_context = contexts.first;
auto a_context = contexts.second != nullptr ? contexts.second : contexts.first;
if (contexts.second != nullptr) {
a_context = ggml_cont(ctx->ggml_ctx, a_context);
}
auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, cfg.timestep_scale_multiplier);
auto v_pair = adaln_single->forward(ctx, v_timestep_scaled);
@@ -1433,8 +1257,6 @@ namespace LTXV {
std::vector<float> audio_cross_pe_vec;
std::vector<float> connector_pe_vec;
std::vector<float> audio_connector_pe_vec;
sd::Tensor<float> vx_input_cache;
sd::Tensor<float> ax_input_cache;
static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
const std::string& bias_name,
@@ -1566,7 +1388,7 @@ namespace LTXV {
model.get_param_tensors(tensors, prefix);
}
std::pair<sd::Tensor<float>, sd::Tensor<float>> split_av_latents(const sd::Tensor<float>& x_tensor,
std::pair<sd::Tensor<float>, sd::Tensor<float>> separate_audio_and_video_latents(const sd::Tensor<float>& x_tensor,
int audio_length) const {
if (x_tensor.empty()) {
return {{}, {}};
@@ -1602,7 +1424,7 @@ namespace LTXV {
return {vx, ax};
}
ggml_tensor* merge_av_latents(ggml_context* ctx,
ggml_tensor* recombine_audio_and_video_latents(ggml_context* ctx,
ggml_tensor* vx,
ggml_tensor* ax) const {
if (ax == nullptr || ggml_nelements(ax) == 0 || ax->ne[1] == 0) {
@@ -1633,16 +1455,12 @@ namespace LTXV {
const sd::Tensor<float>& audio_x_tensor = {},
const sd::Tensor<float>& audio_timesteps_tensor = {},
int audio_length = 0) {
auto split_inputs = split_av_latents(x_tensor, audio_length);
vx_input_cache = split_inputs.first;
if (!audio_x_tensor.empty()) {
ax_input_cache = audio_x_tensor;
} else {
ax_input_cache = split_inputs.second;
}
auto split_inputs = separate_audio_and_video_latents(x_tensor, audio_length);
const sd::Tensor<float>& vx_tensor = split_inputs.first;
const sd::Tensor<float>& ax_tensor = !audio_x_tensor.empty() ? audio_x_tensor : split_inputs.second;
ggml_tensor* vx = make_input(vx_input_cache);
ggml_tensor* ax = make_optional_input(ax_input_cache);
ggml_tensor* vx = make_input(vx_tensor);
ggml_tensor* ax = make_optional_input(ax_tensor);
ggml_tensor* timesteps = make_input(timesteps_tensor);
ggml_tensor* a_timestep = make_optional_input(audio_timesteps_tensor);
ggml_tensor* context = make_optional_input(context_tensor);
@@ -1653,15 +1471,12 @@ namespace LTXV {
vx->ne[1],
vx->ne[2],
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
24.f,
25.f,
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.vae_scale_factors,
params.causal_temporal_positioning,
params.use_middle_indices_grid);
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads);
ggml_set_name(video_pe, "ltxav_video_pe");
params.causal_temporal_positioning);
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.hidden_size / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
set_backend_tensor_data(video_pe, video_pe_vec.data());
ggml_tensor* audio_pe = nullptr;
@@ -1670,12 +1485,10 @@ namespace LTXV {
if (ax != nullptr && ggml_nelements(ax) > 0 && ax->ne[1] > 0) {
audio_pe_vec = build_audio_rope_matrix(ax->ne[1],
static_cast<int>(params.audio_hidden_size),
static_cast<int>(params.audio_num_attention_heads),
params.positional_embedding_theta,
params.audio_positional_embedding_max_pos[0],
params.use_middle_indices_grid);
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
ggml_set_name(audio_pe, "ltxav_audio_pe");
false);
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_hidden_size / 2, ax->ne[1]);
set_backend_tensor_data(audio_pe, audio_pe_vec.data());
int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
@@ -1683,68 +1496,43 @@ namespace LTXV {
vx->ne[1],
vx->ne[2],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
25.f,
params.positional_embedding_theta,
temporal_max_pos,
std::get<0>(params.vae_scale_factors),
params.causal_temporal_positioning,
true);
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads);
ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
audio_cross_pe_vec = build_audio_rope_matrix(ax->ne[1],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
params.positional_embedding_theta,
temporal_max_pos,
true);
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
ggml_set_name(audio_cross_pe, "ltxav_audio_cross_pe");
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, ax->ne[1]);
set_backend_tensor_data(audio_cross_pe, audio_cross_pe_vec.data());
}
bool needs_video_connector_pe =
params.use_connector &&
context != nullptr &&
(context->ne[0] == params.connector_hidden_size ||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
context->ne[0] == params.caption_channels * 2) &&
context->ne[1] < 1024));
ggml_tensor* video_connector_pe = nullptr;
if (needs_video_connector_pe) {
if (params.use_connector && context != nullptr && context->ne[0] == params.connector_hidden_size) {
int64_t seq_len = context->ne[1];
int64_t target_len = std::max<int64_t>(1024, seq_len);
int64_t duplications = (target_len + params.connector_num_registers - 1) / params.connector_num_registers;
int64_t full_len = seq_len + duplications * params.connector_num_registers - seq_len;
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size), static_cast<int>(params.connector_num_heads), 10000.f, 4096.f, true);
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_head_dim / 2, full_len * params.connector_num_heads);
ggml_set_name(video_connector_pe, "ltxav_video_connector_pe");
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size));
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_hidden_size / 2, full_len);
set_backend_tensor_data(video_connector_pe, connector_pe_vec.data());
}
bool run_audio_context =
ax != nullptr &&
ggml_nelements(ax) > 0 &&
ax->ne[1] > 0;
bool needs_audio_connector_pe =
run_audio_context &&
params.use_audio_connector &&
context != nullptr &&
(context->ne[0] == params.audio_connector_hidden_size ||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
context->ne[0] == params.caption_channels * 2) &&
context->ne[1] < 1024));
ggml_tensor* audio_connector_pe = nullptr;
if (needs_audio_connector_pe) {
if (params.use_audio_connector && context != nullptr && context->ne[0] == params.audio_connector_hidden_size) {
int64_t seq_len = context->ne[1];
int64_t target_len = std::max<int64_t>(1024, seq_len);
int64_t duplications = (target_len + params.audio_connector_num_registers - 1) / params.audio_connector_num_registers;
int64_t full_len = seq_len + duplications * params.audio_connector_num_registers - seq_len;
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size), static_cast<int>(params.audio_connector_num_heads), 10000.f, 4096.f, true);
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_head_dim / 2, full_len * params.audio_connector_num_heads);
ggml_set_name(audio_connector_pe, "ltxav_audio_connector_pe");
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size));
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_hidden_size / 2, full_len);
set_backend_tensor_data(audio_connector_pe, audio_connector_pe_vec.data());
}
@@ -1761,7 +1549,7 @@ namespace LTXV {
audio_cross_pe,
video_connector_pe,
audio_connector_pe);
auto out = merge_av_latents(compute_ctx, out_pair.first, out_pair.second);
auto out = recombine_audio_and_video_latents(compute_ctx, out_pair.first, out_pair.second);
ggml_build_forward_expand(gf, out);
return gf;
}
@@ -1776,106 +1564,7 @@ namespace LTXV {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length);
};
auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
return out;
}
void test(const std::string& x_path,
const std::string& timesteps_path = "",
const std::string& context_path = "",
const std::string& audio_x_path = "",
const std::string& audio_timesteps_path = "") {
auto x = sd::load_tensor_from_file_as_tensor<float>(x_path);
GGML_ASSERT(!x.empty());
print_sd_tensor(x, false, "ltxav_x");
sd::Tensor<float> timesteps;
if (!timesteps_path.empty()) {
timesteps = sd::load_tensor_from_file_as_tensor<float>(timesteps_path);
} else {
timesteps = sd::Tensor<float>::from_vector(std::vector<float>{1.f});
}
GGML_ASSERT(!timesteps.empty());
print_sd_tensor(timesteps, false, "ltxav_timesteps");
sd::Tensor<float> context;
if (!context_path.empty()) {
context = sd::load_tensor_from_file_as_tensor<float>(context_path);
GGML_ASSERT(!context.empty());
print_sd_tensor(context, false, "ltxav_context");
}
sd::Tensor<float> audio_x;
int audio_length = 0;
if (!audio_x_path.empty()) {
audio_x = sd::load_tensor_from_file_as_tensor<float>(audio_x_path);
GGML_ASSERT(!audio_x.empty());
GGML_ASSERT(audio_x.dim() >= 2);
audio_length = static_cast<int>(audio_x.shape()[1]);
print_sd_tensor(audio_x, false, "ltxav_audio_x");
}
sd::Tensor<float> audio_timesteps;
if (!audio_timesteps_path.empty()) {
audio_timesteps = sd::load_tensor_from_file_as_tensor<float>(audio_timesteps_path);
GGML_ASSERT(!audio_timesteps.empty());
} else if (!audio_x.empty()) {
audio_timesteps = timesteps;
}
if (!audio_timesteps.empty()) {
print_sd_tensor(audio_timesteps, false, "ltxav_audio_timesteps");
}
int64_t t0 = ggml_time_ms();
auto out_opt = compute(8, x, timesteps, context, audio_x, audio_timesteps, audio_length);
int64_t t1 = ggml_time_ms();
GGML_ASSERT(!out_opt.empty());
print_sd_tensor(out_opt, false, "ltxav_out");
LOG_DEBUG("ltxav test done in %lldms", t1 - t0);
}
static void load_from_file_and_test(const std::string& model_path,
const std::string& x_path,
const std::string& timesteps_path = "",
const std::string& context_path = "",
const std::string& embeddings_path = "",
const std::string& audio_x_path = "",
const std::string& audio_timesteps_path = "") {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
LOG_INFO("loading ltxav from '%s'", model_path.c_str());
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(model_path, "model.diffusion_model.")) {
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
return;
}
if (!embeddings_path.empty()) {
LOG_INFO("loading ltxav embeddings from '%s'", embeddings_path.c_str());
if (!model_loader.init_from_file(embeddings_path)) {
LOG_ERROR("init embeddings model loader from file failed: '%s'", embeddings_path.c_str());
return;
}
}
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
std::shared_ptr<LTXAVRunner> ltxav = std::make_shared<LTXAVRunner>(backend,
false,
tensor_storage_map,
"model.diffusion_model");
ltxav->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
ltxav->get_param_tensors(tensors, "model.diffusion_model");
if (!model_loader.load_tensors(tensors)) {
LOG_ERROR("load tensors from model loader failed");
return;
}
LOG_INFO("ltxav model loaded");
ltxav->test(x_path, timesteps_path, context_path, audio_x_path, audio_timesteps_path);
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
};
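A recurring pattern in the diff above: the per-head RoPE table splitting (num_heads, split_frequencies_by_heads, double_precision) is removed, and every positional-embedding tensor is now built for the full hidden dimension with shape [2, 2, dim/2, tokens]. A minimal sketch of that plumbing as it stands after the change, assuming (per the shapes used above) that build_rope_matrix_from_frequencies emits one 2x2 rotation block per frequency pair:

std::vector<float> pe_vec = build_1d_rope_matrix(seq_len, dim);  // 2*2*(dim/2)*seq_len floats
ggml_tensor* pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, dim / 2, seq_len);
set_backend_tensor_data(pe, pe_vec.data());  // raw floats are copied to the backend before compute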

View File

@@ -14,7 +14,7 @@
#include "diffusion_model.hpp"
#include "esrgan.hpp"
#include "lora.hpp"
#include "ltx_vae.hpp"
#include "ltx_vae.h"
#include "pmid.hpp"
#include "sample-cache.h"
#include "tae.hpp"
@@ -3742,8 +3742,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
}
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
latents.audio_length = 0;
latents.audio_latent = {};
latents.audio_length = get_ltxav_num_audio_latents(request->frames, request->fps);
latents.audio_latent = sd::zeros<float>({16, latents.audio_length, 8, 1});
}
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
@@ -3923,8 +3923,9 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
}
// Pipeline-level audio support is temporarily disabled. Keep the model-side
// AV implementation intact, but feed pure video latents through vid_gen.
if (!latents.audio_latent.empty()) {
latents.init_latent = pack_ltxav_audio_and_video_latents(latents.init_latent, latents.audio_latent);
}
return latents;
}

Binary file not shown.