mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-05-08 16:28:53 +00:00
Compare commits
1 Commits
2ca782a65a
...
ca7e008d78
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ca7e008d78 |
@ -19,6 +19,8 @@
|
||||
#include "common/media_io.h"
|
||||
#include "common/resource_owners.hpp"
|
||||
#include "image_metadata.h"
|
||||
#include "llm.hpp"
|
||||
#include "ltx_vae_test.h"
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
@ -500,6 +502,27 @@ int main(int argc, const char* argv[]) {
|
||||
SDContextParams ctx_params;
|
||||
SDGenerationParams gen_params;
|
||||
|
||||
cli_params.verbose = true;
|
||||
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
||||
{
|
||||
const bool run_ltx_vae_test = false;
|
||||
const std::string model_path = "E:/Code/ComfyUI/models/vae/ltx-2.3-22b-dev_video_vae.safetensors";
|
||||
const std::string input_path = "E:/Code/sd.cpp/build/ltx_vae_z.bin";
|
||||
if (run_ltx_vae_test) {
|
||||
ltx_vae_load_from_file_and_test(model_path, input_path);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// cli_params.verbose = true;
|
||||
// sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
||||
// GemmaTokenizer tokenizer;
|
||||
// auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
|
||||
// for (auto token : tokens) {
|
||||
// LOG_INFO("%d", token);
|
||||
// }
|
||||
// return 0;
|
||||
|
||||
parse_args(argc, argv, cli_params, ctx_params, gen_params);
|
||||
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
||||
log_verbose = cli_params.verbose;
|
||||
|
||||
@ -103,64 +103,6 @@ namespace DiT {
|
||||
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
|
||||
return x;
|
||||
}
|
||||
|
||||
// Splits a video latent into non-overlapping (pt x ph x pw) patches and flattens
// each patch into the feature axis.
//   x:      [N*C, T, H, W]
//   return: [N, t_len*h_len*w_len, C*pt*ph*pw]
//           with t_len = T / pt, h_len = H / ph, w_len = W / pw
// Asserts that x->ne[3] is a multiple of N and that T, H, W are exact multiples
// of pt, ph, pw respectively.
inline ggml_tensor* patchify(ggml_context* ctx,
                             ggml_tensor* x,
                             int pt,
                             int ph,
                             int pw,
                             int64_t N = 1) {
    const int64_t channels = x->ne[3] / N;
    const int64_t frames   = x->ne[2];
    const int64_t height   = x->ne[1];
    const int64_t width    = x->ne[0];

    const int64_t t_len = frames / pt;
    const int64_t h_len = height / ph;
    const int64_t w_len = width / pw;

    GGML_ASSERT(channels * N == x->ne[3]);
    GGML_ASSERT(t_len * pt == frames && h_len * ph == height && w_len * pw == width);

    // Every rearrangement step below swaps the two middle axes and makes the result contiguous.
    auto swap_middle_axes = [ctx](ggml_tensor* t) {
        return ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, t, 0, 2, 1, 3));
    };

    const int64_t row_width   = pw * w_len;
    const int64_t outer_t     = t_len * channels * N;
    const int64_t outer_th    = h_len * t_len * channels * N;
    const int64_t patch_elems = pw * ph * pt;
    const int64_t patch_count = w_len * h_len * t_len;

    x = ggml_reshape_4d(ctx, x, row_width, ph * h_len, pt, outer_t);           // [N*C*t_len, pt, h_len*ph, w_len*pw]
    x = swap_middle_axes(x);                                                   // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, row_width, pt, ph, outer_th);                  // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = swap_middle_axes(x);                                                   // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, outer_th);                 // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = swap_middle_axes(x);                                                   // [N*C*t_len*h_len, w_len, pt*ph, pw]
    x = ggml_reshape_4d(ctx, x, patch_elems, patch_count, channels, N);        // [N, C, t_len*h_len*w_len, pt*ph*pw]
    x = swap_middle_axes(x);                                                   // [N, t_len*h_len*w_len, C, pt*ph*pw]
    x = ggml_reshape_4d(ctx, x, patch_elems * channels, patch_count, N, 1);    // [N, t_len*h_len*w_len, C*pt*ph*pw]
    return x;
}
|
||||
|
||||
// Inverse of patchify: scatters flattened patches back onto the (T, H, W) grid.
//   x:      [N, t_len*h_len*w_len, pt*ph*pw*C]
//   return: [N*C, t_len*pt, h_len*ph, w_len*pw]
// Asserts that x->ne[0] is an exact multiple of pt*ph*pw.
inline ggml_tensor* unpatchify(ggml_context* ctx,
                               ggml_tensor* x,
                               int64_t t_len,
                               int64_t h_len,
                               int64_t w_len,
                               int pt,
                               int ph,
                               int pw) {
    const int64_t batch    = x->ne[3];
    const int64_t channels = x->ne[0] / pt / ph / pw;

    GGML_ASSERT(channels * pt * ph * pw == x->ne[0]);

    // All steps after the first one swap the two middle axes and make the result contiguous.
    auto swap_middle_axes = [ctx](ggml_tensor* t) {
        return ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, t, 0, 2, 1, 3));
    };

    const int64_t row_width = pw * w_len;
    const int64_t outer_th  = h_len * t_len * channels * batch;
    const int64_t outer_t   = t_len * channels * batch;

    x = ggml_reshape_4d(ctx, x, channels, pw * ph * pt, w_len * h_len * t_len, batch);  // [N, t_len*h_len*w_len, pt*ph*pw, C]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));                 // [N, C, t_len*h_len*w_len, pt*ph*pw]
    x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, outer_th);                          // [N*C*t_len*h_len, w_len, pt*ph, pw]
    x = swap_middle_axes(x);                                                            // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = ggml_reshape_4d(ctx, x, row_width, ph, pt, outer_th);                           // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = swap_middle_axes(x);                                                            // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, row_width, pt, ph * h_len, outer_t);                    // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = swap_middle_axes(x);                                                            // [N*C*t_len, pt, h_len*ph, w_len*pw]
    x = ggml_reshape_4d(ctx, x, row_width, ph * h_len, pt * t_len, channels * batch);   // [N*C, t_len*pt, h_len*ph, w_len*pw]
    return x;
}
|
||||
} // namespace DiT
|
||||
|
||||
#endif // __COMMON_DIT_HPP__
|
||||
|
||||
@ -1682,15 +1682,6 @@ struct GGMLRunnerContext {
|
||||
bool circular_x_enabled = false;
|
||||
bool circular_y_enabled = false;
|
||||
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
||||
std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
|
||||
|
||||
// Marks `tensor` as a graph output and records it under `name` in the attached
// debug-tensor map. Does nothing when no map is attached or `tensor` is null.
void capture_tensor(const std::string& name, ggml_tensor* tensor) {
    if (debug_tensors != nullptr && tensor != nullptr) {
        ggml_set_output(tensor);
        auto& registry   = *debug_tensors;
        registry[tensor] = name;  // a later capture of the same tensor overwrites the earlier name
    }
}
|
||||
};
|
||||
|
||||
struct GGMLRunner {
|
||||
@ -1722,7 +1713,6 @@ protected:
|
||||
|
||||
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
|
||||
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
|
||||
std::unordered_map<ggml_tensor*, std::string> debug_tensors;
|
||||
const std::string final_result_name = "ggml_runner_final_result_tensor";
|
||||
|
||||
bool flash_attn_enabled = false;
|
||||
@ -1809,7 +1799,6 @@ protected:
|
||||
}
|
||||
|
||||
void free_compute_ctx() {
|
||||
debug_tensors.clear();
|
||||
if (compute_ctx != nullptr) {
|
||||
ggml_free(compute_ctx);
|
||||
compute_ctx = nullptr;
|
||||
@ -1845,11 +1834,6 @@ protected:
|
||||
auto result = ggml_graph_node(gf, -1);
|
||||
ggml_set_name(result, final_result_name.c_str());
|
||||
}
|
||||
for (const auto& entry : debug_tensors) {
|
||||
if (entry.first != nullptr) {
|
||||
ggml_build_forward_expand(gf, entry.first);
|
||||
}
|
||||
}
|
||||
prepare_build_in_tensor_after(gf);
|
||||
return gf;
|
||||
}
|
||||
@ -1919,21 +1903,6 @@ protected:
|
||||
for (auto& kv : backend_tensor_data_map) {
|
||||
auto tensor = kv.first;
|
||||
auto data = kv.second;
|
||||
if (tensor == nullptr || data == nullptr) {
|
||||
continue;
|
||||
}
|
||||
const char* name = ggml_get_name(tensor);
|
||||
if (tensor->buffer == nullptr) {
|
||||
LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
|
||||
get_desc().c_str(),
|
||||
name != nullptr ? name : "",
|
||||
(long long)tensor->ne[0],
|
||||
(long long)tensor->ne[1],
|
||||
(long long)tensor->ne[2],
|
||||
(long long)tensor->ne[3],
|
||||
ggml_type_name(tensor->type));
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
|
||||
}
|
||||
@ -2056,7 +2025,6 @@ public:
|
||||
runner_ctx.circular_x_enabled = circular_x_enabled;
|
||||
runner_ctx.circular_y_enabled = circular_y_enabled;
|
||||
runner_ctx.weight_adapter = weight_adapter;
|
||||
runner_ctx.debug_tensors = &debug_tensors;
|
||||
return runner_ctx;
|
||||
}
|
||||
|
||||
@ -2195,21 +2163,6 @@ public:
|
||||
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
|
||||
return std::nullopt;
|
||||
}
|
||||
for (const auto& entry : debug_tensors) {
|
||||
auto tensor = entry.first;
|
||||
if (tensor == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (tensor->type != GGML_TYPE_F32) {
|
||||
LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
|
||||
get_desc().c_str(),
|
||||
entry.second.c_str(),
|
||||
ggml_type_name(tensor->type));
|
||||
continue;
|
||||
}
|
||||
auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
|
||||
print_sd_tensor(debug_tensor, false, entry.second.c_str());
|
||||
}
|
||||
copy_cache_tensors_to_cache_buffer();
|
||||
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
|
||||
std::optional<sd::Tensor<T>> output;
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_LTX_VAE_HPP__
|
||||
#define __SD_LTX_VAE_HPP__
|
||||
#ifndef __SD_LTX_VAE_H__
|
||||
#define __SD_LTX_VAE_H__
|
||||
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
@ -936,8 +936,7 @@ struct LTXVideoVAE : public VAE {
|
||||
|
||||
static void load_from_file_and_test(const std::string& model_path,
|
||||
const std::string& input_path) {
|
||||
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||
ggml_backend_t backend = ggml_backend_cpu_init();
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
|
||||
|
||||
ModelLoader model_loader;
|
||||
@ -968,4 +967,4 @@ struct LTXVideoVAE : public VAE {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __SD_LTX_VAE_HPP__
|
||||
#endif // __SD_LTX_VAE_H__
|
||||
8
src/ltx_vae_test.cpp
Normal file
8
src/ltx_vae_test.cpp
Normal file
@ -0,0 +1,8 @@
|
||||
#include "ltx_vae_test.h"
|
||||
|
||||
#include "ltx_vae.h"
|
||||
|
||||
// Free-function entry point that forwards both paths unchanged to
// LTXVideoVAE::load_from_file_and_test.
void ltx_vae_load_from_file_and_test(const std::string& model_path, const std::string& input_path) {
    LTXVideoVAE::load_from_file_and_test(model_path, input_path);
}
|
||||
9
src/ltx_vae_test.h
Normal file
9
src/ltx_vae_test.h
Normal file
@ -0,0 +1,9 @@
|
||||
#ifndef __SD_LTX_VAE_TEST_H__
|
||||
#define __SD_LTX_VAE_TEST_H__
|
||||
|
||||
#include <string>
|
||||
|
||||
void ltx_vae_load_from_file_and_test(const std::string& model_path,
|
||||
const std::string& input_path);
|
||||
|
||||
#endif // __SD_LTX_VAE_TEST_H__
|
||||
429
src/ltxv.hpp
429
src/ltxv.hpp
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_LTXV_HPP__
|
||||
#define __SD_LTXV_HPP__
|
||||
#ifndef __LTXV_HPP__
|
||||
#define __LTXV_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
@ -79,30 +79,6 @@ namespace LTXV {
|
||||
return out;
|
||||
}
|
||||
|
||||
// Builds a double-precision frequency grid:
//   freq_count = dim / (2 * positional_dims)
//   out[i]     = (pi / 2) * theta^(i / (freq_count - 1)),  i in [0, freq_count)
// A single-entry grid is just {pi / 2}.
// Returns an empty vector when freq_count is not positive or when
// positional_dims is 0 (which would otherwise divide by zero).
__STATIC_INLINE__ std::vector<double> generate_freq_grid_double(double theta,
                                                                int positional_dims,
                                                                int dim) {
    constexpr double half_pi = 1.5707963267948966;

    const int n_elem = 2 * positional_dims;
    if (n_elem == 0) {
        return {};
    }

    // Validate the count before sizing the vector: a negative int converted to
    // the vector's size type would request an enormous allocation and throw.
    const int freq_count = dim / n_elem;
    if (freq_count <= 0) {
        return {};
    }

    std::vector<double> out(static_cast<size_t>(freq_count));
    if (freq_count == 1) {
        // The general formula would divide by (freq_count - 1) == 0 here.
        out[0] = half_pi;
        return out;
    }

    const double log_theta  = std::log(theta);
    const double last_index = static_cast<double>(freq_count - 1);
    for (int i = 0; i < freq_count; i++) {
        const double ratio = static_cast<double>(i) / last_index;
        out[i]             = std::exp(log_theta * ratio) * half_pi;
    }
    return out;
}
|
||||
|
||||
__STATIC_INLINE__ std::vector<float> build_rope_matrix_from_frequencies(
|
||||
const std::vector<std::vector<float>>& frequencies,
|
||||
int dim) {
|
||||
@ -126,43 +102,16 @@ namespace LTXV {
|
||||
return out;
|
||||
}
|
||||
|
||||
// Splits each token's frequency row (length inner_dim / 2) into num_heads
// consecutive slices of equal length. The result holds one row per (token, head)
// pair, laid out token-major: row index = token * num_heads + head.
// Asserts num_heads > 0, that inner_dim and inner_dim / 2 are both divisible by
// num_heads, and that every input row has length inner_dim / 2.
__STATIC_INLINE__ std::vector<std::vector<float>> split_frequencies_by_heads(
    const std::vector<std::vector<float>>& frequencies,
    int inner_dim,
    int num_heads) {
    GGML_ASSERT(num_heads > 0);
    GGML_ASSERT(inner_dim % num_heads == 0);
    const int half_width  = inner_dim / 2;
    const int slice_width = half_width / num_heads;
    GGML_ASSERT(half_width % num_heads == 0);

    const size_t head_count = static_cast<size_t>(num_heads);
    std::vector<std::vector<float>> out(frequencies.size() * head_count,
                                        std::vector<float>(slice_width, 0.f));

    size_t row = 0;
    for (const auto& token_freqs : frequencies) {
        GGML_ASSERT(static_cast<int>(token_freqs.size()) == half_width);
        auto src = token_freqs.begin();
        for (int head = 0; head < num_heads; head++) {
            std::copy_n(src, slice_width, out[row].begin());
            src += slice_width;
            row++;
        }
    }
    return out;
}
|
||||
|
||||
__STATIC_INLINE__ std::vector<float> build_video_rope_matrix(int64_t width,
|
||||
int64_t height,
|
||||
int64_t frames,
|
||||
int dim,
|
||||
int num_heads = 1,
|
||||
float frame_rate = 25.f,
|
||||
float theta = 10000.f,
|
||||
const std::vector<int>& max_pos = {20, 2048, 2048},
|
||||
const std::tuple<int, int, int>& vae_scale_factors = {8, 32, 32},
|
||||
bool causal_temporal_positioning = false,
|
||||
bool use_middle_indices_grid = false) {
|
||||
bool causal_temporal_positioning = false) {
|
||||
GGML_ASSERT(max_pos.size() == 3);
|
||||
GGML_ASSERT(dim % num_heads == 0);
|
||||
const std::vector<float> indices = generate_freq_grid(theta, 3, dim);
|
||||
const int half_dim = dim / 2;
|
||||
const int pad_size = half_dim - static_cast<int>(indices.size()) * 3;
|
||||
@ -180,25 +129,11 @@ namespace LTXV {
|
||||
pixel_t = std::max(0.f, pixel_t + 1.f - scale_t);
|
||||
}
|
||||
pixel_t /= frame_rate;
|
||||
if (use_middle_indices_grid) {
|
||||
float end = static_cast<float>((t + 1) * scale_t);
|
||||
if (causal_temporal_positioning) {
|
||||
end = std::max(0.f, end + 1.f - scale_t);
|
||||
}
|
||||
end /= frame_rate;
|
||||
pixel_t = 0.5f * (pixel_t + end);
|
||||
}
|
||||
|
||||
for (int64_t h = 0; h < height; h++) {
|
||||
float pixel_h = static_cast<float>(h * scale_h);
|
||||
if (use_middle_indices_grid) {
|
||||
pixel_h += 0.5f * static_cast<float>(scale_h);
|
||||
}
|
||||
for (int64_t w = 0; w < width; w++) {
|
||||
float pixel_w = static_cast<float>(w * scale_w);
|
||||
if (use_middle_indices_grid) {
|
||||
pixel_w += 0.5f * static_cast<float>(scale_w);
|
||||
}
|
||||
|
||||
int out_idx = 0;
|
||||
for (int i = 0; i < pad_size; i++) {
|
||||
@ -211,6 +146,13 @@ namespace LTXV {
|
||||
pixel_w / max_pos[2],
|
||||
};
|
||||
|
||||
// Match ComfyUI generate_freqs():
|
||||
// (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
|
||||
// .transpose(-1, -2)
|
||||
// .flatten(2)
|
||||
// After the transpose, the half-dim order is:
|
||||
// [t_f0, h_f0, w_f0, t_f1, h_f1, w_f1, ...]
|
||||
// not [t_f0, t_f1, ..., h_f0, h_f1, ..., w_f0, w_f1, ...].
|
||||
for (float index : indices) {
|
||||
for (int axis = 0; axis < 3; axis++) {
|
||||
freqs[token][out_idx++] = index * (coords[axis] * 2.f - 1.f);
|
||||
@ -221,24 +163,16 @@ namespace LTXV {
|
||||
}
|
||||
}
|
||||
|
||||
if (num_heads > 1) {
|
||||
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
|
||||
}
|
||||
return build_rope_matrix_from_frequencies(freqs, dim);
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix(int64_t seq_len,
|
||||
int dim,
|
||||
int num_heads = 1,
|
||||
float theta = 10000.f,
|
||||
float positional_scale = 4096.f,
|
||||
bool double_precision = false) {
|
||||
GGML_ASSERT(dim % num_heads == 0);
|
||||
const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
|
||||
const std::vector<double> indices_d =
|
||||
double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
|
||||
float positional_scale = 4096.f) {
|
||||
const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
|
||||
const int half_dim = dim / 2;
|
||||
const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
|
||||
const int pad_size = half_dim - static_cast<int>(indices.size());
|
||||
|
||||
std::vector<std::vector<float>> freqs(static_cast<size_t>(seq_len), std::vector<float>(half_dim, 0.f));
|
||||
for (int64_t pos = 0; pos < seq_len; pos++) {
|
||||
@ -247,39 +181,20 @@ namespace LTXV {
|
||||
freqs[static_cast<size_t>(pos)][out_idx++] = 0.f;
|
||||
}
|
||||
|
||||
if (double_precision) {
|
||||
double coord = static_cast<double>(pos) / static_cast<double>(positional_scale);
|
||||
for (double index : indices_d) {
|
||||
freqs[static_cast<size_t>(pos)][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
|
||||
}
|
||||
} else {
|
||||
float coord = static_cast<float>(pos) / positional_scale;
|
||||
for (float index : indices) {
|
||||
freqs[static_cast<size_t>(pos)][out_idx++] = index * (coord * 2.f - 1.f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (num_heads > 1) {
|
||||
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
|
||||
}
|
||||
return build_rope_matrix_from_frequencies(freqs, dim);
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ ggml_tensor* apply_hidden_rope(ggml_context* ctx,
|
||||
ggml_tensor* x,
|
||||
ggml_tensor* pe,
|
||||
int64_t heads,
|
||||
int64_t dim_head,
|
||||
bool rope_interleaved) {
|
||||
GGML_ASSERT(x->ne[0] == heads * dim_head);
|
||||
auto x4 = ggml_reshape_4d(ctx, x, dim_head, heads, x->ne[1], x->ne[2]);
|
||||
if (pe != nullptr && pe->ne[3] == x->ne[1] * heads) {
|
||||
auto x_flat = ggml_reshape_4d(ctx, x4, dim_head, 1, x->ne[1] * heads, x->ne[2]);
|
||||
auto out_flat = Rope::apply_rope(ctx, x_flat, pe, rope_interleaved);
|
||||
auto out4 = ggml_reshape_4d(ctx, out_flat, dim_head, heads, x->ne[1], x->ne[2]);
|
||||
return ggml_reshape_3d(ctx, out4, heads * dim_head, x->ne[1], x->ne[2]);
|
||||
}
|
||||
auto x4 = ggml_reshape_4d(ctx, x, x->ne[0], 1, x->ne[1], x->ne[2]);
|
||||
return Rope::apply_rope(ctx, x4, pe, rope_interleaved);
|
||||
}
|
||||
|
||||
@ -423,8 +338,8 @@ namespace LTXV {
|
||||
if (k_pe == nullptr) {
|
||||
k_pe = pe;
|
||||
}
|
||||
q = apply_hidden_rope(ctx->ggml_ctx, q, pe, heads, dim_head, rope_interleaved);
|
||||
k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, heads, dim_head, rope_interleaved);
|
||||
q = apply_hidden_rope(ctx->ggml_ctx, q, pe, rope_interleaved);
|
||||
k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, rope_interleaved);
|
||||
}
|
||||
|
||||
auto out = ggml_ext_attention_ext(ctx->ggml_ctx,
|
||||
@ -500,7 +415,7 @@ namespace LTXV {
|
||||
s = ggml_repeat(ctx->ggml_ctx, s, e);
|
||||
t = ggml_repeat(ctx->ggml_ctx, t, e);
|
||||
auto out = ggml_add(ctx->ggml_ctx, s, t);
|
||||
return ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
|
||||
return ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
|
||||
}
|
||||
|
||||
std::vector<ggml_tensor*> get_prompt_scale_shift_values(GGMLRunnerContext* ctx,
|
||||
@ -694,7 +609,7 @@ namespace LTXV {
|
||||
float positional_embedding_theta = 10000.f;
|
||||
std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
|
||||
std::tuple<int, int, int> vae_scale_factors = {8, 32, 32};
|
||||
bool causal_temporal_positioning = true;
|
||||
bool causal_temporal_positioning = false;
|
||||
float timestep_scale_multiplier = 1000.f;
|
||||
|
||||
int64_t audio_in_channels = 128;
|
||||
@ -726,14 +641,11 @@ namespace LTXV {
|
||||
bool audio_connector_rope_interleaved = false;
|
||||
bool audio_connector_apply_gated_attention = false;
|
||||
|
||||
bool video_rope_interleaved = false;
|
||||
bool use_middle_indices_grid = true;
|
||||
bool video_rope_interleaved = true;
|
||||
bool cross_attention_adaln = false;
|
||||
|
||||
bool use_caption_projection = true;
|
||||
bool use_audio_caption_projection = true;
|
||||
bool caption_proj_before_connector = true;
|
||||
bool caption_projection_first_linear = false;
|
||||
|
||||
bool self_attention_gated = false;
|
||||
bool cross_attention_gated = false;
|
||||
@ -758,16 +670,11 @@ namespace LTXV {
|
||||
|
||||
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix_from_coords(const std::vector<float>& coords,
|
||||
int dim,
|
||||
int num_heads = 1,
|
||||
float theta = 10000.f,
|
||||
float max_pos = 20.f,
|
||||
bool double_precision = false) {
|
||||
GGML_ASSERT(dim % num_heads == 0);
|
||||
const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
|
||||
const std::vector<double> indices_d =
|
||||
double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
|
||||
float max_pos = 20.f) {
|
||||
const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
|
||||
const int half_dim = dim / 2;
|
||||
const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
|
||||
const int pad_size = half_dim - static_cast<int>(indices.size());
|
||||
|
||||
std::vector<std::vector<float>> freqs(coords.size(), std::vector<float>(half_dim, 0.f));
|
||||
for (size_t pos = 0; pos < coords.size(); pos++) {
|
||||
@ -775,21 +682,11 @@ namespace LTXV {
|
||||
for (int i = 0; i < pad_size; i++) {
|
||||
freqs[pos][out_idx++] = 0.f;
|
||||
}
|
||||
if (double_precision) {
|
||||
double coord = static_cast<double>(coords[pos]) / static_cast<double>(max_pos);
|
||||
for (double index : indices_d) {
|
||||
freqs[pos][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
|
||||
}
|
||||
} else {
|
||||
float coord = coords[pos] / max_pos;
|
||||
for (float index : indices) {
|
||||
freqs[pos][out_idx++] = index * (coord * 2.f - 1.f);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (num_heads > 1) {
|
||||
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
|
||||
}
|
||||
return build_rope_matrix_from_frequencies(freqs, dim);
|
||||
}
|
||||
|
||||
@ -808,7 +705,6 @@ namespace LTXV {
|
||||
int64_t height,
|
||||
int64_t frames,
|
||||
int dim,
|
||||
int num_heads,
|
||||
float frame_rate,
|
||||
float theta,
|
||||
int max_pos_t,
|
||||
@ -829,7 +725,7 @@ namespace LTXV {
|
||||
}
|
||||
}
|
||||
}
|
||||
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
|
||||
return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
|
||||
@ -846,7 +742,6 @@ namespace LTXV {
|
||||
|
||||
__STATIC_INLINE__ std::vector<float> build_audio_rope_matrix(int64_t seq_len,
|
||||
int dim,
|
||||
int num_heads,
|
||||
float theta = 10000.f,
|
||||
int max_pos_t = 20,
|
||||
bool use_middle_indices_grid = false) {
|
||||
@ -860,7 +755,7 @@ namespace LTXV {
|
||||
coords[static_cast<size_t>(t)] = start;
|
||||
}
|
||||
}
|
||||
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
|
||||
return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
|
||||
}
|
||||
|
||||
struct BasicAVTransformerBlock : public GGMLBlock {
|
||||
@ -930,7 +825,7 @@ namespace LTXV {
|
||||
t = ggml_repeat(ctx->ggml_ctx, t, e);
|
||||
s = ggml_repeat(ctx->ggml_ctx, s, e);
|
||||
auto out = ggml_add(ctx->ggml_ctx, s, t);
|
||||
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
|
||||
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
|
||||
return std::vector<ggml_tensor*>(chunks.begin() + start, chunks.begin() + start + count);
|
||||
}
|
||||
|
||||
@ -1109,23 +1004,11 @@ namespace LTXV {
|
||||
blocks["av_ca_v2a_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 1);
|
||||
|
||||
if (cfg.use_caption_projection) {
|
||||
if (cfg.caption_proj_before_connector) {
|
||||
if (cfg.caption_projection_first_linear) {
|
||||
blocks["caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.hidden_size);
|
||||
}
|
||||
} else {
|
||||
blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.hidden_size, cfg.hidden_size);
|
||||
}
|
||||
}
|
||||
if (cfg.use_audio_caption_projection) {
|
||||
if (cfg.caption_proj_before_connector) {
|
||||
if (cfg.caption_projection_first_linear) {
|
||||
blocks["audio_caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.audio_hidden_size);
|
||||
}
|
||||
} else {
|
||||
blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.audio_hidden_size, cfg.audio_hidden_size);
|
||||
}
|
||||
}
|
||||
|
||||
if (cfg.use_connector) {
|
||||
blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.connector_hidden_size,
|
||||
@ -1197,97 +1080,42 @@ namespace LTXV {
|
||||
std::pair<ggml_tensor*, ggml_tensor*> preprocess_contexts(GGMLRunnerContext* ctx,
|
||||
ggml_tensor* context,
|
||||
ggml_tensor* video_connector_pe,
|
||||
ggml_tensor* audio_connector_pe,
|
||||
bool process_audio_context) {
|
||||
ggml_tensor* audio_connector_pe) {
|
||||
if (context == nullptr) {
|
||||
return {nullptr, nullptr};
|
||||
}
|
||||
|
||||
bool is_fully_processed_context =
|
||||
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
|
||||
context->ne[1] >= 1024;
|
||||
bool is_unprocessed_dual_context =
|
||||
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
|
||||
context->ne[1] < 1024;
|
||||
|
||||
if (is_fully_processed_context) {
|
||||
auto v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
|
||||
ggml_tensor* a_context = nullptr;
|
||||
if (process_audio_context) {
|
||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
|
||||
}
|
||||
return {v_context, a_context};
|
||||
if (context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim) {
|
||||
return {
|
||||
ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim),
|
||||
ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim)
|
||||
};
|
||||
}
|
||||
|
||||
ggml_tensor* v_context = context;
|
||||
ggml_tensor* a_context = process_audio_context ? context : nullptr;
|
||||
if (is_unprocessed_dual_context) {
|
||||
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
|
||||
if (process_audio_context) {
|
||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
|
||||
}
|
||||
} else if (context->ne[0] == cfg.caption_channels * 2) {
|
||||
ggml_tensor* a_context = context;
|
||||
if (context->ne[0] == cfg.caption_channels * 2) {
|
||||
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.caption_channels);
|
||||
if (process_audio_context) {
|
||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.caption_channels, cfg.caption_channels * 2);
|
||||
}
|
||||
}
|
||||
|
||||
if (cfg.caption_proj_before_connector) {
|
||||
if (cfg.use_caption_projection &&
|
||||
blocks.count("caption_projection") > 0 &&
|
||||
v_context != nullptr &&
|
||||
v_context->ne[0] == cfg.caption_channels) {
|
||||
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["caption_projection"]);
|
||||
if (caption_projection != nullptr) {
|
||||
v_context = caption_projection->forward(ctx, v_context);
|
||||
}
|
||||
}
|
||||
if (process_audio_context &&
|
||||
cfg.use_audio_caption_projection &&
|
||||
blocks.count("audio_caption_projection") > 0 &&
|
||||
a_context != nullptr &&
|
||||
a_context->ne[0] == cfg.caption_channels) {
|
||||
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["audio_caption_projection"]);
|
||||
if (caption_projection != nullptr) {
|
||||
a_context = caption_projection->forward(ctx, a_context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cfg.use_connector && v_context != nullptr && v_context->ne[0] == cfg.connector_hidden_size) {
|
||||
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["video_embeddings_connector"]);
|
||||
v_context = connector->forward(ctx, v_context, video_connector_pe);
|
||||
}
|
||||
if (process_audio_context &&
|
||||
cfg.use_audio_connector &&
|
||||
a_context != nullptr &&
|
||||
a_context->ne[0] == cfg.audio_connector_hidden_size) {
|
||||
if (cfg.use_audio_connector && a_context != nullptr && a_context->ne[0] == cfg.audio_connector_hidden_size) {
|
||||
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["audio_embeddings_connector"]);
|
||||
a_context = connector->forward(ctx, a_context, audio_connector_pe);
|
||||
}
|
||||
|
||||
if (!cfg.caption_proj_before_connector &&
|
||||
cfg.use_caption_projection &&
|
||||
blocks.count("caption_projection") > 0 &&
|
||||
v_context != nullptr &&
|
||||
v_context->ne[0] == cfg.caption_channels) {
|
||||
if (cfg.use_caption_projection && v_context != nullptr && v_context->ne[0] == cfg.caption_channels) {
|
||||
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["caption_projection"]);
|
||||
if (caption_projection != nullptr) {
|
||||
v_context = caption_projection->forward(ctx, v_context);
|
||||
}
|
||||
}
|
||||
if (process_audio_context &&
|
||||
!cfg.caption_proj_before_connector &&
|
||||
cfg.use_audio_caption_projection &&
|
||||
blocks.count("audio_caption_projection") > 0 &&
|
||||
a_context != nullptr &&
|
||||
a_context->ne[0] == cfg.caption_channels) {
|
||||
if (cfg.use_audio_caption_projection && a_context != nullptr && a_context->ne[0] == cfg.caption_channels) {
|
||||
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["audio_caption_projection"]);
|
||||
if (caption_projection != nullptr) {
|
||||
a_context = caption_projection->forward(ctx, a_context);
|
||||
}
|
||||
}
|
||||
|
||||
return {v_context, a_context};
|
||||
}
|
||||
@ -1340,13 +1168,9 @@ namespace LTXV {
|
||||
ax = nullptr;
|
||||
}
|
||||
|
||||
bool run_ax = ax != nullptr && ggml_nelements(ax) > 0 && audio_time > 0;
|
||||
auto contexts = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe, run_ax);
|
||||
auto contexts = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe);
|
||||
auto v_context = contexts.first;
|
||||
auto a_context = contexts.second != nullptr ? contexts.second : contexts.first;
|
||||
if (contexts.second != nullptr) {
|
||||
a_context = ggml_cont(ctx->ggml_ctx, a_context);
|
||||
}
|
||||
|
||||
auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, cfg.timestep_scale_multiplier);
|
||||
auto v_pair = adaln_single->forward(ctx, v_timestep_scaled);
|
||||
@ -1433,8 +1257,6 @@ namespace LTXV {
|
||||
std::vector<float> audio_cross_pe_vec;
|
||||
std::vector<float> connector_pe_vec;
|
||||
std::vector<float> audio_connector_pe_vec;
|
||||
sd::Tensor<float> vx_input_cache;
|
||||
sd::Tensor<float> ax_input_cache;
|
||||
|
||||
static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& bias_name,
|
||||
@ -1566,7 +1388,7 @@ namespace LTXV {
|
||||
model.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
std::pair<sd::Tensor<float>, sd::Tensor<float>> split_av_latents(const sd::Tensor<float>& x_tensor,
|
||||
std::pair<sd::Tensor<float>, sd::Tensor<float>> separate_audio_and_video_latents(const sd::Tensor<float>& x_tensor,
|
||||
int audio_length) const {
|
||||
if (x_tensor.empty()) {
|
||||
return {{}, {}};
|
||||
@ -1602,7 +1424,7 @@ namespace LTXV {
|
||||
return {vx, ax};
|
||||
}
|
||||
|
||||
ggml_tensor* merge_av_latents(ggml_context* ctx,
|
||||
ggml_tensor* recombine_audio_and_video_latents(ggml_context* ctx,
|
||||
ggml_tensor* vx,
|
||||
ggml_tensor* ax) const {
|
||||
if (ax == nullptr || ggml_nelements(ax) == 0 || ax->ne[1] == 0) {
|
||||
@ -1633,16 +1455,12 @@ namespace LTXV {
|
||||
const sd::Tensor<float>& audio_x_tensor = {},
|
||||
const sd::Tensor<float>& audio_timesteps_tensor = {},
|
||||
int audio_length = 0) {
|
||||
auto split_inputs = split_av_latents(x_tensor, audio_length);
|
||||
vx_input_cache = split_inputs.first;
|
||||
if (!audio_x_tensor.empty()) {
|
||||
ax_input_cache = audio_x_tensor;
|
||||
} else {
|
||||
ax_input_cache = split_inputs.second;
|
||||
}
|
||||
auto split_inputs = separate_audio_and_video_latents(x_tensor, audio_length);
|
||||
const sd::Tensor<float>& vx_tensor = split_inputs.first;
|
||||
const sd::Tensor<float>& ax_tensor = !audio_x_tensor.empty() ? audio_x_tensor : split_inputs.second;
|
||||
|
||||
ggml_tensor* vx = make_input(vx_input_cache);
|
||||
ggml_tensor* ax = make_optional_input(ax_input_cache);
|
||||
ggml_tensor* vx = make_input(vx_tensor);
|
||||
ggml_tensor* ax = make_optional_input(ax_tensor);
|
||||
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
||||
ggml_tensor* a_timestep = make_optional_input(audio_timesteps_tensor);
|
||||
ggml_tensor* context = make_optional_input(context_tensor);
|
||||
@ -1653,15 +1471,12 @@ namespace LTXV {
|
||||
vx->ne[1],
|
||||
vx->ne[2],
|
||||
static_cast<int>(params.hidden_size),
|
||||
static_cast<int>(params.num_attention_heads),
|
||||
24.f,
|
||||
25.f,
|
||||
params.positional_embedding_theta,
|
||||
params.positional_embedding_max_pos,
|
||||
params.vae_scale_factors,
|
||||
params.causal_temporal_positioning,
|
||||
params.use_middle_indices_grid);
|
||||
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads);
|
||||
ggml_set_name(video_pe, "ltxav_video_pe");
|
||||
params.causal_temporal_positioning);
|
||||
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.hidden_size / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
|
||||
set_backend_tensor_data(video_pe, video_pe_vec.data());
|
||||
|
||||
ggml_tensor* audio_pe = nullptr;
|
||||
@ -1670,12 +1485,10 @@ namespace LTXV {
|
||||
if (ax != nullptr && ggml_nelements(ax) > 0 && ax->ne[1] > 0) {
|
||||
audio_pe_vec = build_audio_rope_matrix(ax->ne[1],
|
||||
static_cast<int>(params.audio_hidden_size),
|
||||
static_cast<int>(params.audio_num_attention_heads),
|
||||
params.positional_embedding_theta,
|
||||
params.audio_positional_embedding_max_pos[0],
|
||||
params.use_middle_indices_grid);
|
||||
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
|
||||
ggml_set_name(audio_pe, "ltxav_audio_pe");
|
||||
false);
|
||||
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_hidden_size / 2, ax->ne[1]);
|
||||
set_backend_tensor_data(audio_pe, audio_pe_vec.data());
|
||||
|
||||
int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
|
||||
@ -1683,68 +1496,43 @@ namespace LTXV {
|
||||
vx->ne[1],
|
||||
vx->ne[2],
|
||||
static_cast<int>(params.audio_cross_attention_dim),
|
||||
static_cast<int>(params.audio_num_attention_heads),
|
||||
25.f,
|
||||
params.positional_embedding_theta,
|
||||
temporal_max_pos,
|
||||
std::get<0>(params.vae_scale_factors),
|
||||
params.causal_temporal_positioning,
|
||||
true);
|
||||
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads);
|
||||
ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
|
||||
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
|
||||
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
|
||||
|
||||
audio_cross_pe_vec = build_audio_rope_matrix(ax->ne[1],
|
||||
static_cast<int>(params.audio_cross_attention_dim),
|
||||
static_cast<int>(params.audio_num_attention_heads),
|
||||
params.positional_embedding_theta,
|
||||
temporal_max_pos,
|
||||
true);
|
||||
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
|
||||
ggml_set_name(audio_cross_pe, "ltxav_audio_cross_pe");
|
||||
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, ax->ne[1]);
|
||||
set_backend_tensor_data(audio_cross_pe, audio_cross_pe_vec.data());
|
||||
}
|
||||
|
||||
bool needs_video_connector_pe =
|
||||
params.use_connector &&
|
||||
context != nullptr &&
|
||||
(context->ne[0] == params.connector_hidden_size ||
|
||||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
|
||||
context->ne[0] == params.caption_channels * 2) &&
|
||||
context->ne[1] < 1024));
|
||||
ggml_tensor* video_connector_pe = nullptr;
|
||||
if (needs_video_connector_pe) {
|
||||
if (params.use_connector && context != nullptr && context->ne[0] == params.connector_hidden_size) {
|
||||
int64_t seq_len = context->ne[1];
|
||||
int64_t target_len = std::max<int64_t>(1024, seq_len);
|
||||
int64_t duplications = (target_len + params.connector_num_registers - 1) / params.connector_num_registers;
|
||||
int64_t full_len = seq_len + duplications * params.connector_num_registers - seq_len;
|
||||
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size), static_cast<int>(params.connector_num_heads), 10000.f, 4096.f, true);
|
||||
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_head_dim / 2, full_len * params.connector_num_heads);
|
||||
ggml_set_name(video_connector_pe, "ltxav_video_connector_pe");
|
||||
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size));
|
||||
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_hidden_size / 2, full_len);
|
||||
set_backend_tensor_data(video_connector_pe, connector_pe_vec.data());
|
||||
}
|
||||
|
||||
bool run_audio_context =
|
||||
ax != nullptr &&
|
||||
ggml_nelements(ax) > 0 &&
|
||||
ax->ne[1] > 0;
|
||||
bool needs_audio_connector_pe =
|
||||
run_audio_context &&
|
||||
params.use_audio_connector &&
|
||||
context != nullptr &&
|
||||
(context->ne[0] == params.audio_connector_hidden_size ||
|
||||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
|
||||
context->ne[0] == params.caption_channels * 2) &&
|
||||
context->ne[1] < 1024));
|
||||
ggml_tensor* audio_connector_pe = nullptr;
|
||||
if (needs_audio_connector_pe) {
|
||||
if (params.use_audio_connector && context != nullptr && context->ne[0] == params.audio_connector_hidden_size) {
|
||||
int64_t seq_len = context->ne[1];
|
||||
int64_t target_len = std::max<int64_t>(1024, seq_len);
|
||||
int64_t duplications = (target_len + params.audio_connector_num_registers - 1) / params.audio_connector_num_registers;
|
||||
int64_t full_len = seq_len + duplications * params.audio_connector_num_registers - seq_len;
|
||||
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size), static_cast<int>(params.audio_connector_num_heads), 10000.f, 4096.f, true);
|
||||
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_head_dim / 2, full_len * params.audio_connector_num_heads);
|
||||
ggml_set_name(audio_connector_pe, "ltxav_audio_connector_pe");
|
||||
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size));
|
||||
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_hidden_size / 2, full_len);
|
||||
set_backend_tensor_data(audio_connector_pe, audio_connector_pe_vec.data());
|
||||
}
|
||||
|
||||
@ -1761,7 +1549,7 @@ namespace LTXV {
|
||||
audio_cross_pe,
|
||||
video_connector_pe,
|
||||
audio_connector_pe);
|
||||
auto out = merge_av_latents(compute_ctx, out_pair.first, out_pair.second);
|
||||
auto out = recombine_audio_and_video_latents(compute_ctx, out_pair.first, out_pair.second);
|
||||
ggml_build_forward_expand(gf, out);
|
||||
return gf;
|
||||
}
|
||||
@ -1776,106 +1564,7 @@ namespace LTXV {
|
||||
auto get_graph = [&]() -> ggml_cgraph* {
|
||||
return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length);
|
||||
};
|
||||
auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
return out;
|
||||
}
|
||||
|
||||
void test(const std::string& x_path,
|
||||
const std::string& timesteps_path = "",
|
||||
const std::string& context_path = "",
|
||||
const std::string& audio_x_path = "",
|
||||
const std::string& audio_timesteps_path = "") {
|
||||
auto x = sd::load_tensor_from_file_as_tensor<float>(x_path);
|
||||
GGML_ASSERT(!x.empty());
|
||||
print_sd_tensor(x, false, "ltxav_x");
|
||||
|
||||
sd::Tensor<float> timesteps;
|
||||
if (!timesteps_path.empty()) {
|
||||
timesteps = sd::load_tensor_from_file_as_tensor<float>(timesteps_path);
|
||||
} else {
|
||||
timesteps = sd::Tensor<float>::from_vector(std::vector<float>{1.f});
|
||||
}
|
||||
GGML_ASSERT(!timesteps.empty());
|
||||
print_sd_tensor(timesteps, false, "ltxav_timesteps");
|
||||
|
||||
sd::Tensor<float> context;
|
||||
if (!context_path.empty()) {
|
||||
context = sd::load_tensor_from_file_as_tensor<float>(context_path);
|
||||
GGML_ASSERT(!context.empty());
|
||||
print_sd_tensor(context, false, "ltxav_context");
|
||||
}
|
||||
|
||||
sd::Tensor<float> audio_x;
|
||||
int audio_length = 0;
|
||||
if (!audio_x_path.empty()) {
|
||||
audio_x = sd::load_tensor_from_file_as_tensor<float>(audio_x_path);
|
||||
GGML_ASSERT(!audio_x.empty());
|
||||
GGML_ASSERT(audio_x.dim() >= 2);
|
||||
audio_length = static_cast<int>(audio_x.shape()[1]);
|
||||
print_sd_tensor(audio_x, false, "ltxav_audio_x");
|
||||
}
|
||||
|
||||
sd::Tensor<float> audio_timesteps;
|
||||
if (!audio_timesteps_path.empty()) {
|
||||
audio_timesteps = sd::load_tensor_from_file_as_tensor<float>(audio_timesteps_path);
|
||||
GGML_ASSERT(!audio_timesteps.empty());
|
||||
} else if (!audio_x.empty()) {
|
||||
audio_timesteps = timesteps;
|
||||
}
|
||||
if (!audio_timesteps.empty()) {
|
||||
print_sd_tensor(audio_timesteps, false, "ltxav_audio_timesteps");
|
||||
}
|
||||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
auto out_opt = compute(8, x, timesteps, context, audio_x, audio_timesteps, audio_length);
|
||||
int64_t t1 = ggml_time_ms();
|
||||
|
||||
GGML_ASSERT(!out_opt.empty());
|
||||
print_sd_tensor(out_opt, false, "ltxav_out");
|
||||
LOG_DEBUG("ltxav test done in %lldms", t1 - t0);
|
||||
}
|
||||
|
||||
static void load_from_file_and_test(const std::string& model_path,
|
||||
const std::string& x_path,
|
||||
const std::string& timesteps_path = "",
|
||||
const std::string& context_path = "",
|
||||
const std::string& embeddings_path = "",
|
||||
const std::string& audio_x_path = "",
|
||||
const std::string& audio_timesteps_path = "") {
|
||||
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||
ggml_backend_t backend = ggml_backend_cpu_init();
|
||||
LOG_INFO("loading ltxav from '%s'", model_path.c_str());
|
||||
|
||||
ModelLoader model_loader;
|
||||
if (!model_loader.init_from_file_and_convert_name(model_path, "model.diffusion_model.")) {
|
||||
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
||||
return;
|
||||
}
|
||||
if (!embeddings_path.empty()) {
|
||||
LOG_INFO("loading ltxav embeddings from '%s'", embeddings_path.c_str());
|
||||
if (!model_loader.init_from_file(embeddings_path)) {
|
||||
LOG_ERROR("init embeddings model loader from file failed: '%s'", embeddings_path.c_str());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
||||
std::shared_ptr<LTXAVRunner> ltxav = std::make_shared<LTXAVRunner>(backend,
|
||||
false,
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
|
||||
ltxav->alloc_params_buffer();
|
||||
std::map<std::string, ggml_tensor*> tensors;
|
||||
ltxav->get_param_tensors(tensors, "model.diffusion_model");
|
||||
|
||||
if (!model_loader.load_tensors(tensors)) {
|
||||
LOG_ERROR("load tensors from model loader failed");
|
||||
return;
|
||||
}
|
||||
|
||||
LOG_INFO("ltxav model loaded");
|
||||
ltxav->test(x_path, timesteps_path, context_path, audio_x_path, audio_timesteps_path);
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
#include "diffusion_model.hpp"
|
||||
#include "esrgan.hpp"
|
||||
#include "lora.hpp"
|
||||
#include "ltx_vae.hpp"
|
||||
#include "ltx_vae.h"
|
||||
#include "pmid.hpp"
|
||||
#include "sample-cache.h"
|
||||
#include "tae.hpp"
|
||||
@ -3742,8 +3742,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
||||
}
|
||||
|
||||
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||
latents.audio_length = 0;
|
||||
latents.audio_latent = {};
|
||||
latents.audio_length = get_ltxav_num_audio_latents(request->frames, request->fps);
|
||||
latents.audio_latent = sd::zeros<float>({16, latents.audio_length, 8, 1});
|
||||
}
|
||||
|
||||
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||
@ -3923,8 +3923,9 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
||||
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
|
||||
}
|
||||
|
||||
// Pipeline-level audio support is temporarily disabled. Keep the model-side
|
||||
// AV implementation intact, but feed pure video latents through vid_gen.
|
||||
if (!latents.audio_latent.empty()) {
|
||||
latents.init_latent = pack_ltxav_audio_and_video_latents(latents.init_latent, latents.audio_latent);
|
||||
}
|
||||
|
||||
return latents;
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user