mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-05-08 16:28:53 +00:00
Compare commits
4 Commits
ca7e008d78
...
2ca782a65a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2ca782a65a | ||
|
|
d51f35bf63 | ||
|
|
0b65927b1b | ||
|
|
831b321c6a |
@ -19,8 +19,6 @@
|
|||||||
#include "common/media_io.h"
|
#include "common/media_io.h"
|
||||||
#include "common/resource_owners.hpp"
|
#include "common/resource_owners.hpp"
|
||||||
#include "image_metadata.h"
|
#include "image_metadata.h"
|
||||||
#include "llm.hpp"
|
|
||||||
#include "ltx_vae_test.h"
|
|
||||||
|
|
||||||
namespace fs = std::filesystem;
|
namespace fs = std::filesystem;
|
||||||
|
|
||||||
@ -502,27 +500,6 @@ int main(int argc, const char* argv[]) {
|
|||||||
SDContextParams ctx_params;
|
SDContextParams ctx_params;
|
||||||
SDGenerationParams gen_params;
|
SDGenerationParams gen_params;
|
||||||
|
|
||||||
cli_params.verbose = true;
|
|
||||||
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
|
||||||
{
|
|
||||||
const bool run_ltx_vae_test = false;
|
|
||||||
const std::string model_path = "E:/Code/ComfyUI/models/vae/ltx-2.3-22b-dev_video_vae.safetensors";
|
|
||||||
const std::string input_path = "E:/Code/sd.cpp/build/ltx_vae_z.bin";
|
|
||||||
if (run_ltx_vae_test) {
|
|
||||||
ltx_vae_load_from_file_and_test(model_path, input_path);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// cli_params.verbose = true;
|
|
||||||
// sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
|
||||||
// GemmaTokenizer tokenizer;
|
|
||||||
// auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
|
|
||||||
// for (auto token : tokens) {
|
|
||||||
// LOG_INFO("%d", token);
|
|
||||||
// }
|
|
||||||
// return 0;
|
|
||||||
|
|
||||||
parse_args(argc, argv, cli_params, ctx_params, gen_params);
|
parse_args(argc, argv, cli_params, ctx_params, gen_params);
|
||||||
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
||||||
log_verbose = cli_params.verbose;
|
log_verbose = cli_params.verbose;
|
||||||
|
|||||||
@ -103,6 +103,64 @@ namespace DiT {
|
|||||||
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
|
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Splits a video-shaped tensor into non-overlapping (pt, ph, pw) patches and
// flattens every patch, together with its channels, into one token.
//
// Shapes in the comments are written outermost-first, i.e. in the reverse of
// ggml's ne[] order (ne[0] is the innermost W axis).
//
//   ctx        ggml context used to build the graph nodes
//   x          input tensor, [N*C, T, H, W]
//   pt/ph/pw   patch extent along T / H / W; each must be > 0 and divide its axis
//   N          batch size folded into the outermost axis of x; must be > 0
//
// Returns a tensor of shape [N, t_len*h_len*w_len, C*pt*ph*pw].
inline ggml_tensor* patchify(ggml_context* ctx,
                             ggml_tensor* x,
                             int pt,
                             int ph,
                             int pw,
                             int64_t N = 1) {
    // x: [N*C, T, H, W]
    // return: [N, t_len*h_len*w_len, C*pt*ph*pw]

    // Validate the divisors first: the divisions below would otherwise be an
    // integer division by zero before any of the shape assertions can fire.
    GGML_ASSERT(N > 0 && pt > 0 && ph > 0 && pw > 0);

    int64_t C     = x->ne[3] / N;
    int64_t T     = x->ne[2];
    int64_t H     = x->ne[1];
    int64_t W     = x->ne[0];
    int64_t t_len = T / pt;
    int64_t h_len = H / ph;
    int64_t w_len = W / pw;

    // Every axis must split exactly; partial patches are not supported.
    GGML_ASSERT(C * N == x->ne[3]);
    GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);

    // Each reshape/permute pair below moves one patch axis inwards; the order
    // of the steps matters, so they are kept exactly as a linear sequence.
    x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N);      // [N*C*t_len, pt, h_len*ph, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N);      // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N);      // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, w_len, pt*ph, pw]
    x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N);      // [N, C, t_len*h_len*w_len, pt*ph*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N, t_len*h_len*w_len, C, pt*ph*pw]
    x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1);  // [N, t_len*h_len*w_len, C*pt*ph*pw]
    return x;
}
|
||||||
|
|
||||||
|
// Inverse of the patch flattening: turns a token sequence back into a
// video-shaped tensor by scattering every token's (pt, ph, pw) patch into place.
//
// Shapes in the comments are written outermost-first, i.e. in the reverse of
// ggml's ne[] order.
//
//   ctx                 ggml context used to build the graph nodes
//   x                   input tokens, [N, t_len*h_len*w_len, pt*ph*pw*C]
//   t_len/h_len/w_len   number of patches along T / H / W
//   pt/ph/pw            patch extent along T / H / W; each must be > 0
//
// Returns a tensor of shape [N*C, t_len*pt, h_len*ph, w_len*pw].
//
// NOTE(review): N is read from x->ne[3], while patchify() in this file emits
// the batch in ne[2] (its last reshape is (..., N, 1)). The two agree for
// N == 1; confirm the expected layout for N > 1 against the callers.
inline ggml_tensor* unpatchify(ggml_context* ctx,
                               ggml_tensor* x,
                               int64_t t_len,
                               int64_t h_len,
                               int64_t w_len,
                               int pt,
                               int ph,
                               int pw) {
    // x: [N, t_len*h_len*w_len, pt*ph*pw*C]
    // return: [N*C, t_len*pt, h_len*ph, w_len*pw]

    // Validate the divisors first: the channel computation below would
    // otherwise be an integer division by zero.
    GGML_ASSERT(pt > 0 && ph > 0 && pw > 0);

    int64_t N = x->ne[3];
    int64_t C = x->ne[0] / pt / ph / pw;

    // The innermost axis must hold a whole number of channels per patch element.
    GGML_ASSERT(C * pt * ph * pw == x->ne[0]);

    // The reshape/permute pairs undo the patch flattening one axis at a time;
    // the order of the steps matters, so they are kept as a linear sequence.
    x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N);  // [N, t_len*h_len*w_len, pt*ph*pw, C]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));      // [N, C, t_len*h_len*w_len, pt*ph*pw]
    x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N);  // [N*C*t_len*h_len, w_len, pt*ph, pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N);  // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N);  // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len, pt, h_len*ph, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N);  // [N*C, t_len*pt, h_len*ph, w_len*pw]
    return x;
}
|
||||||
} // namespace DiT
|
} // namespace DiT
|
||||||
|
|
||||||
#endif // __COMMON_DIT_HPP__
|
#endif // __COMMON_DIT_HPP__
|
||||||
|
|||||||
@ -1682,6 +1682,15 @@ struct GGMLRunnerContext {
|
|||||||
bool circular_x_enabled = false;
|
bool circular_x_enabled = false;
|
||||||
bool circular_y_enabled = false;
|
bool circular_y_enabled = false;
|
||||||
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
||||||
|
std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
|
||||||
|
|
||||||
|
// Records `tensor` under `name` in the debug-tensor map this context points at.
// Does nothing when no map is attached (debug_tensors == nullptr) or when
// `tensor` is null. Otherwise the tensor is flagged with ggml_set_output() and
// any name previously stored for the same tensor is overwritten.
void capture_tensor(const std::string& name, ggml_tensor* tensor) {
    const bool can_capture = (debug_tensors != nullptr) && (tensor != nullptr);
    if (can_capture) {
        ggml_set_output(tensor);
        debug_tensors->insert_or_assign(tensor, name);
    }
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct GGMLRunner {
|
struct GGMLRunner {
|
||||||
@ -1713,6 +1722,7 @@ protected:
|
|||||||
|
|
||||||
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
|
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
|
||||||
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
|
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
|
||||||
|
std::unordered_map<ggml_tensor*, std::string> debug_tensors;
|
||||||
const std::string final_result_name = "ggml_runner_final_result_tensor";
|
const std::string final_result_name = "ggml_runner_final_result_tensor";
|
||||||
|
|
||||||
bool flash_attn_enabled = false;
|
bool flash_attn_enabled = false;
|
||||||
@ -1799,6 +1809,7 @@ protected:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void free_compute_ctx() {
|
void free_compute_ctx() {
|
||||||
|
debug_tensors.clear();
|
||||||
if (compute_ctx != nullptr) {
|
if (compute_ctx != nullptr) {
|
||||||
ggml_free(compute_ctx);
|
ggml_free(compute_ctx);
|
||||||
compute_ctx = nullptr;
|
compute_ctx = nullptr;
|
||||||
@ -1834,6 +1845,11 @@ protected:
|
|||||||
auto result = ggml_graph_node(gf, -1);
|
auto result = ggml_graph_node(gf, -1);
|
||||||
ggml_set_name(result, final_result_name.c_str());
|
ggml_set_name(result, final_result_name.c_str());
|
||||||
}
|
}
|
||||||
|
for (const auto& entry : debug_tensors) {
|
||||||
|
if (entry.first != nullptr) {
|
||||||
|
ggml_build_forward_expand(gf, entry.first);
|
||||||
|
}
|
||||||
|
}
|
||||||
prepare_build_in_tensor_after(gf);
|
prepare_build_in_tensor_after(gf);
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
@ -1903,6 +1919,21 @@ protected:
|
|||||||
for (auto& kv : backend_tensor_data_map) {
|
for (auto& kv : backend_tensor_data_map) {
|
||||||
auto tensor = kv.first;
|
auto tensor = kv.first;
|
||||||
auto data = kv.second;
|
auto data = kv.second;
|
||||||
|
if (tensor == nullptr || data == nullptr) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const char* name = ggml_get_name(tensor);
|
||||||
|
if (tensor->buffer == nullptr) {
|
||||||
|
LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
|
||||||
|
get_desc().c_str(),
|
||||||
|
name != nullptr ? name : "",
|
||||||
|
(long long)tensor->ne[0],
|
||||||
|
(long long)tensor->ne[1],
|
||||||
|
(long long)tensor->ne[2],
|
||||||
|
(long long)tensor->ne[3],
|
||||||
|
ggml_type_name(tensor->type));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
|
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
|
||||||
}
|
}
|
||||||
@ -2025,6 +2056,7 @@ public:
|
|||||||
runner_ctx.circular_x_enabled = circular_x_enabled;
|
runner_ctx.circular_x_enabled = circular_x_enabled;
|
||||||
runner_ctx.circular_y_enabled = circular_y_enabled;
|
runner_ctx.circular_y_enabled = circular_y_enabled;
|
||||||
runner_ctx.weight_adapter = weight_adapter;
|
runner_ctx.weight_adapter = weight_adapter;
|
||||||
|
runner_ctx.debug_tensors = &debug_tensors;
|
||||||
return runner_ctx;
|
return runner_ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2163,6 +2195,21 @@ public:
|
|||||||
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
|
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
|
||||||
return std::nullopt;
|
return std::nullopt;
|
||||||
}
|
}
|
||||||
|
for (const auto& entry : debug_tensors) {
|
||||||
|
auto tensor = entry.first;
|
||||||
|
if (tensor == nullptr) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (tensor->type != GGML_TYPE_F32) {
|
||||||
|
LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
|
||||||
|
get_desc().c_str(),
|
||||||
|
entry.second.c_str(),
|
||||||
|
ggml_type_name(tensor->type));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
|
||||||
|
print_sd_tensor(debug_tensor, false, entry.second.c_str());
|
||||||
|
}
|
||||||
copy_cache_tensors_to_cache_buffer();
|
copy_cache_tensors_to_cache_buffer();
|
||||||
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
|
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
|
||||||
std::optional<sd::Tensor<T>> output;
|
std::optional<sd::Tensor<T>> output;
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef __SD_LTX_VAE_H__
|
#ifndef __SD_LTX_VAE_HPP__
|
||||||
#define __SD_LTX_VAE_H__
|
#define __SD_LTX_VAE_HPP__
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
@ -936,7 +936,8 @@ struct LTXVideoVAE : public VAE {
|
|||||||
|
|
||||||
static void load_from_file_and_test(const std::string& model_path,
|
static void load_from_file_and_test(const std::string& model_path,
|
||||||
const std::string& input_path) {
|
const std::string& input_path) {
|
||||||
ggml_backend_t backend = ggml_backend_cuda_init(0);
|
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||||
|
ggml_backend_t backend = ggml_backend_cpu_init();
|
||||||
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
|
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
|
||||||
|
|
||||||
ModelLoader model_loader;
|
ModelLoader model_loader;
|
||||||
@ -967,4 +968,4 @@ struct LTXVideoVAE : public VAE {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // __SD_LTX_VAE_H__
|
#endif // __SD_LTX_VAE_HPP__
|
||||||
@ -1,8 +0,0 @@
|
|||||||
#include "ltx_vae_test.h"
|
|
||||||
|
|
||||||
#include "ltx_vae.h"
|
|
||||||
|
|
||||||
void ltx_vae_load_from_file_and_test(const std::string& model_path,
|
|
||||||
const std::string& input_path) {
|
|
||||||
LTXVideoVAE::load_from_file_and_test(model_path, input_path);
|
|
||||||
}
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
#ifndef __SD_LTX_VAE_TEST_H__
|
|
||||||
#define __SD_LTX_VAE_TEST_H__
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
void ltx_vae_load_from_file_and_test(const std::string& model_path,
|
|
||||||
const std::string& input_path);
|
|
||||||
|
|
||||||
#endif // __SD_LTX_VAE_TEST_H__
|
|
||||||
429
src/ltxv.hpp
429
src/ltxv.hpp
@ -1,5 +1,5 @@
|
|||||||
#ifndef __LTXV_HPP__
|
#ifndef __SD_LTXV_HPP__
|
||||||
#define __LTXV_HPP__
|
#define __SD_LTXV_HPP__
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -79,6 +79,30 @@ namespace LTXV {
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Double-precision frequency grid for rotary position embeddings.
//
//   theta            base of the geometric progression
//   positional_dims  number of positional axes sharing `dim`
//   dim              embedding width
//
// Produces freq_count = dim / (2 * positional_dims) values. Entry i equals
// theta^(i / (freq_count - 1)) * pi/2, so the grid runs from pi/2 to
// theta * pi/2. A single-entry grid is just {pi/2}.
//
// Returns an empty vector when no frequency fits (freq_count <= 0) or when
// positional_dims is 0. The count is validated before the vector is sized, so
// a negative count can no longer be converted to a huge size_t, and a zero
// positional_dims no longer divides by zero.
__STATIC_INLINE__ std::vector<double> generate_freq_grid_double(double theta,
                                                                int positional_dims,
                                                                int dim) {
    constexpr double half_pi = 1.5707963267948966;

    const int n_elem = 2 * positional_dims;
    if (n_elem == 0) {
        return {};
    }
    const int freq_count = dim / n_elem;
    if (freq_count <= 0) {
        return {};
    }

    std::vector<double> out(static_cast<size_t>(freq_count));
    if (freq_count == 1) {
        // Avoids the 0 / 0 ratio the general formula would produce.
        out[0] = half_pi;
        return out;
    }

    const double log_theta = std::log(theta);
    const double last      = static_cast<double>(freq_count - 1);
    for (int i = 0; i < freq_count; i++) {
        const double ratio          = static_cast<double>(i) / last;
        out[static_cast<size_t>(i)] = std::exp(log_theta * ratio) * half_pi;
    }
    return out;
}
|
||||||
|
|
||||||
__STATIC_INLINE__ std::vector<float> build_rope_matrix_from_frequencies(
|
__STATIC_INLINE__ std::vector<float> build_rope_matrix_from_frequencies(
|
||||||
const std::vector<std::vector<float>>& frequencies,
|
const std::vector<std::vector<float>>& frequencies,
|
||||||
int dim) {
|
int dim) {
|
||||||
@ -102,16 +126,43 @@ namespace LTXV {
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Re-partitions per-token frequency rows into per-(token, head) rows.
//
//   frequencies  one row per token, each of length inner_dim / 2
//   inner_dim    full embedding width across all heads
//   num_heads    number of attention heads; must be > 0 and divide both
//                inner_dim and inner_dim / 2
//
// Returns frequencies.size() * num_heads rows of length
// (inner_dim / 2) / num_heads. Row (token * num_heads + head) holds the
// head-th contiguous slice of that token's input row.
__STATIC_INLINE__ std::vector<std::vector<float>> split_frequencies_by_heads(
    const std::vector<std::vector<float>>& frequencies,
    int inner_dim,
    int num_heads) {
    GGML_ASSERT(num_heads > 0);
    GGML_ASSERT(inner_dim % num_heads == 0);
    const int inner_half_dim    = inner_dim / 2;
    const int per_head_half_dim = inner_half_dim / num_heads;
    GGML_ASSERT(inner_half_dim % num_heads == 0);

    const size_t head_count  = static_cast<size_t>(num_heads);
    const size_t token_count = frequencies.size();

    std::vector<std::vector<float>> out(token_count * head_count,
                                        std::vector<float>(per_head_half_dim, 0.f));

    // `row` walks the output in (token, head) order, matching token * num_heads + head.
    size_t row = 0;
    for (size_t token = 0; token < token_count; token++) {
        GGML_ASSERT(static_cast<int>(frequencies[token].size()) == inner_half_dim);
        auto slice_begin = frequencies[token].begin();
        for (int head = 0; head < num_heads; head++, row++) {
            const auto slice_end = slice_begin + per_head_half_dim;
            std::copy(slice_begin, slice_end, out[row].begin());
            slice_begin = slice_end;
        }
    }
    return out;
}
|
||||||
|
|
||||||
__STATIC_INLINE__ std::vector<float> build_video_rope_matrix(int64_t width,
|
__STATIC_INLINE__ std::vector<float> build_video_rope_matrix(int64_t width,
|
||||||
int64_t height,
|
int64_t height,
|
||||||
int64_t frames,
|
int64_t frames,
|
||||||
int dim,
|
int dim,
|
||||||
|
int num_heads = 1,
|
||||||
float frame_rate = 25.f,
|
float frame_rate = 25.f,
|
||||||
float theta = 10000.f,
|
float theta = 10000.f,
|
||||||
const std::vector<int>& max_pos = {20, 2048, 2048},
|
const std::vector<int>& max_pos = {20, 2048, 2048},
|
||||||
const std::tuple<int, int, int>& vae_scale_factors = {8, 32, 32},
|
const std::tuple<int, int, int>& vae_scale_factors = {8, 32, 32},
|
||||||
bool causal_temporal_positioning = false) {
|
bool causal_temporal_positioning = false,
|
||||||
|
bool use_middle_indices_grid = false) {
|
||||||
GGML_ASSERT(max_pos.size() == 3);
|
GGML_ASSERT(max_pos.size() == 3);
|
||||||
|
GGML_ASSERT(dim % num_heads == 0);
|
||||||
const std::vector<float> indices = generate_freq_grid(theta, 3, dim);
|
const std::vector<float> indices = generate_freq_grid(theta, 3, dim);
|
||||||
const int half_dim = dim / 2;
|
const int half_dim = dim / 2;
|
||||||
const int pad_size = half_dim - static_cast<int>(indices.size()) * 3;
|
const int pad_size = half_dim - static_cast<int>(indices.size()) * 3;
|
||||||
@ -129,11 +180,25 @@ namespace LTXV {
|
|||||||
pixel_t = std::max(0.f, pixel_t + 1.f - scale_t);
|
pixel_t = std::max(0.f, pixel_t + 1.f - scale_t);
|
||||||
}
|
}
|
||||||
pixel_t /= frame_rate;
|
pixel_t /= frame_rate;
|
||||||
|
if (use_middle_indices_grid) {
|
||||||
|
float end = static_cast<float>((t + 1) * scale_t);
|
||||||
|
if (causal_temporal_positioning) {
|
||||||
|
end = std::max(0.f, end + 1.f - scale_t);
|
||||||
|
}
|
||||||
|
end /= frame_rate;
|
||||||
|
pixel_t = 0.5f * (pixel_t + end);
|
||||||
|
}
|
||||||
|
|
||||||
for (int64_t h = 0; h < height; h++) {
|
for (int64_t h = 0; h < height; h++) {
|
||||||
float pixel_h = static_cast<float>(h * scale_h);
|
float pixel_h = static_cast<float>(h * scale_h);
|
||||||
|
if (use_middle_indices_grid) {
|
||||||
|
pixel_h += 0.5f * static_cast<float>(scale_h);
|
||||||
|
}
|
||||||
for (int64_t w = 0; w < width; w++) {
|
for (int64_t w = 0; w < width; w++) {
|
||||||
float pixel_w = static_cast<float>(w * scale_w);
|
float pixel_w = static_cast<float>(w * scale_w);
|
||||||
|
if (use_middle_indices_grid) {
|
||||||
|
pixel_w += 0.5f * static_cast<float>(scale_w);
|
||||||
|
}
|
||||||
|
|
||||||
int out_idx = 0;
|
int out_idx = 0;
|
||||||
for (int i = 0; i < pad_size; i++) {
|
for (int i = 0; i < pad_size; i++) {
|
||||||
@ -146,13 +211,6 @@ namespace LTXV {
|
|||||||
pixel_w / max_pos[2],
|
pixel_w / max_pos[2],
|
||||||
};
|
};
|
||||||
|
|
||||||
// Match ComfyUI generate_freqs():
|
|
||||||
// (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
|
|
||||||
// .transpose(-1, -2)
|
|
||||||
// .flatten(2)
|
|
||||||
// After the transpose, the half-dim order is:
|
|
||||||
// [t_f0, h_f0, w_f0, t_f1, h_f1, w_f1, ...]
|
|
||||||
// not [t_f0, t_f1, ..., h_f0, h_f1, ..., w_f0, w_f1, ...].
|
|
||||||
for (float index : indices) {
|
for (float index : indices) {
|
||||||
for (int axis = 0; axis < 3; axis++) {
|
for (int axis = 0; axis < 3; axis++) {
|
||||||
freqs[token][out_idx++] = index * (coords[axis] * 2.f - 1.f);
|
freqs[token][out_idx++] = index * (coords[axis] * 2.f - 1.f);
|
||||||
@ -163,16 +221,24 @@ namespace LTXV {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (num_heads > 1) {
|
||||||
|
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
|
||||||
|
}
|
||||||
return build_rope_matrix_from_frequencies(freqs, dim);
|
return build_rope_matrix_from_frequencies(freqs, dim);
|
||||||
}
|
}
|
||||||
|
|
||||||
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix(int64_t seq_len,
|
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix(int64_t seq_len,
|
||||||
int dim,
|
int dim,
|
||||||
|
int num_heads = 1,
|
||||||
float theta = 10000.f,
|
float theta = 10000.f,
|
||||||
float positional_scale = 4096.f) {
|
float positional_scale = 4096.f,
|
||||||
const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
|
bool double_precision = false) {
|
||||||
|
GGML_ASSERT(dim % num_heads == 0);
|
||||||
|
const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
|
||||||
|
const std::vector<double> indices_d =
|
||||||
|
double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
|
||||||
const int half_dim = dim / 2;
|
const int half_dim = dim / 2;
|
||||||
const int pad_size = half_dim - static_cast<int>(indices.size());
|
const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
|
||||||
|
|
||||||
std::vector<std::vector<float>> freqs(static_cast<size_t>(seq_len), std::vector<float>(half_dim, 0.f));
|
std::vector<std::vector<float>> freqs(static_cast<size_t>(seq_len), std::vector<float>(half_dim, 0.f));
|
||||||
for (int64_t pos = 0; pos < seq_len; pos++) {
|
for (int64_t pos = 0; pos < seq_len; pos++) {
|
||||||
@ -181,20 +247,39 @@ namespace LTXV {
|
|||||||
freqs[static_cast<size_t>(pos)][out_idx++] = 0.f;
|
freqs[static_cast<size_t>(pos)][out_idx++] = 0.f;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (double_precision) {
|
||||||
|
double coord = static_cast<double>(pos) / static_cast<double>(positional_scale);
|
||||||
|
for (double index : indices_d) {
|
||||||
|
freqs[static_cast<size_t>(pos)][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
float coord = static_cast<float>(pos) / positional_scale;
|
float coord = static_cast<float>(pos) / positional_scale;
|
||||||
for (float index : indices) {
|
for (float index : indices) {
|
||||||
freqs[static_cast<size_t>(pos)][out_idx++] = index * (coord * 2.f - 1.f);
|
freqs[static_cast<size_t>(pos)][out_idx++] = index * (coord * 2.f - 1.f);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (num_heads > 1) {
|
||||||
|
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
|
||||||
|
}
|
||||||
return build_rope_matrix_from_frequencies(freqs, dim);
|
return build_rope_matrix_from_frequencies(freqs, dim);
|
||||||
}
|
}
|
||||||
|
|
||||||
__STATIC_INLINE__ ggml_tensor* apply_hidden_rope(ggml_context* ctx,
|
__STATIC_INLINE__ ggml_tensor* apply_hidden_rope(ggml_context* ctx,
|
||||||
ggml_tensor* x,
|
ggml_tensor* x,
|
||||||
ggml_tensor* pe,
|
ggml_tensor* pe,
|
||||||
|
int64_t heads,
|
||||||
|
int64_t dim_head,
|
||||||
bool rope_interleaved) {
|
bool rope_interleaved) {
|
||||||
auto x4 = ggml_reshape_4d(ctx, x, x->ne[0], 1, x->ne[1], x->ne[2]);
|
GGML_ASSERT(x->ne[0] == heads * dim_head);
|
||||||
|
auto x4 = ggml_reshape_4d(ctx, x, dim_head, heads, x->ne[1], x->ne[2]);
|
||||||
|
if (pe != nullptr && pe->ne[3] == x->ne[1] * heads) {
|
||||||
|
auto x_flat = ggml_reshape_4d(ctx, x4, dim_head, 1, x->ne[1] * heads, x->ne[2]);
|
||||||
|
auto out_flat = Rope::apply_rope(ctx, x_flat, pe, rope_interleaved);
|
||||||
|
auto out4 = ggml_reshape_4d(ctx, out_flat, dim_head, heads, x->ne[1], x->ne[2]);
|
||||||
|
return ggml_reshape_3d(ctx, out4, heads * dim_head, x->ne[1], x->ne[2]);
|
||||||
|
}
|
||||||
return Rope::apply_rope(ctx, x4, pe, rope_interleaved);
|
return Rope::apply_rope(ctx, x4, pe, rope_interleaved);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -338,8 +423,8 @@ namespace LTXV {
|
|||||||
if (k_pe == nullptr) {
|
if (k_pe == nullptr) {
|
||||||
k_pe = pe;
|
k_pe = pe;
|
||||||
}
|
}
|
||||||
q = apply_hidden_rope(ctx->ggml_ctx, q, pe, rope_interleaved);
|
q = apply_hidden_rope(ctx->ggml_ctx, q, pe, heads, dim_head, rope_interleaved);
|
||||||
k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, rope_interleaved);
|
k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, heads, dim_head, rope_interleaved);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto out = ggml_ext_attention_ext(ctx->ggml_ctx,
|
auto out = ggml_ext_attention_ext(ctx->ggml_ctx,
|
||||||
@ -415,7 +500,7 @@ namespace LTXV {
|
|||||||
s = ggml_repeat(ctx->ggml_ctx, s, e);
|
s = ggml_repeat(ctx->ggml_ctx, s, e);
|
||||||
t = ggml_repeat(ctx->ggml_ctx, t, e);
|
t = ggml_repeat(ctx->ggml_ctx, t, e);
|
||||||
auto out = ggml_add(ctx->ggml_ctx, s, t);
|
auto out = ggml_add(ctx->ggml_ctx, s, t);
|
||||||
return ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
|
return ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<ggml_tensor*> get_prompt_scale_shift_values(GGMLRunnerContext* ctx,
|
std::vector<ggml_tensor*> get_prompt_scale_shift_values(GGMLRunnerContext* ctx,
|
||||||
@ -609,7 +694,7 @@ namespace LTXV {
|
|||||||
float positional_embedding_theta = 10000.f;
|
float positional_embedding_theta = 10000.f;
|
||||||
std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
|
std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
|
||||||
std::tuple<int, int, int> vae_scale_factors = {8, 32, 32};
|
std::tuple<int, int, int> vae_scale_factors = {8, 32, 32};
|
||||||
bool causal_temporal_positioning = false;
|
bool causal_temporal_positioning = true;
|
||||||
float timestep_scale_multiplier = 1000.f;
|
float timestep_scale_multiplier = 1000.f;
|
||||||
|
|
||||||
int64_t audio_in_channels = 128;
|
int64_t audio_in_channels = 128;
|
||||||
@ -641,11 +726,14 @@ namespace LTXV {
|
|||||||
bool audio_connector_rope_interleaved = false;
|
bool audio_connector_rope_interleaved = false;
|
||||||
bool audio_connector_apply_gated_attention = false;
|
bool audio_connector_apply_gated_attention = false;
|
||||||
|
|
||||||
bool video_rope_interleaved = true;
|
bool video_rope_interleaved = false;
|
||||||
|
bool use_middle_indices_grid = true;
|
||||||
bool cross_attention_adaln = false;
|
bool cross_attention_adaln = false;
|
||||||
|
|
||||||
bool use_caption_projection = true;
|
bool use_caption_projection = true;
|
||||||
bool use_audio_caption_projection = true;
|
bool use_audio_caption_projection = true;
|
||||||
|
bool caption_proj_before_connector = true;
|
||||||
|
bool caption_projection_first_linear = false;
|
||||||
|
|
||||||
bool self_attention_gated = false;
|
bool self_attention_gated = false;
|
||||||
bool cross_attention_gated = false;
|
bool cross_attention_gated = false;
|
||||||
@ -670,11 +758,16 @@ namespace LTXV {
|
|||||||
|
|
||||||
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix_from_coords(const std::vector<float>& coords,
|
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix_from_coords(const std::vector<float>& coords,
|
||||||
int dim,
|
int dim,
|
||||||
|
int num_heads = 1,
|
||||||
float theta = 10000.f,
|
float theta = 10000.f,
|
||||||
float max_pos = 20.f) {
|
float max_pos = 20.f,
|
||||||
const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
|
bool double_precision = false) {
|
||||||
|
GGML_ASSERT(dim % num_heads == 0);
|
||||||
|
const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
|
||||||
|
const std::vector<double> indices_d =
|
||||||
|
double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
|
||||||
const int half_dim = dim / 2;
|
const int half_dim = dim / 2;
|
||||||
const int pad_size = half_dim - static_cast<int>(indices.size());
|
const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
|
||||||
|
|
||||||
std::vector<std::vector<float>> freqs(coords.size(), std::vector<float>(half_dim, 0.f));
|
std::vector<std::vector<float>> freqs(coords.size(), std::vector<float>(half_dim, 0.f));
|
||||||
for (size_t pos = 0; pos < coords.size(); pos++) {
|
for (size_t pos = 0; pos < coords.size(); pos++) {
|
||||||
@ -682,11 +775,21 @@ namespace LTXV {
|
|||||||
for (int i = 0; i < pad_size; i++) {
|
for (int i = 0; i < pad_size; i++) {
|
||||||
freqs[pos][out_idx++] = 0.f;
|
freqs[pos][out_idx++] = 0.f;
|
||||||
}
|
}
|
||||||
|
if (double_precision) {
|
||||||
|
double coord = static_cast<double>(coords[pos]) / static_cast<double>(max_pos);
|
||||||
|
for (double index : indices_d) {
|
||||||
|
freqs[pos][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
float coord = coords[pos] / max_pos;
|
float coord = coords[pos] / max_pos;
|
||||||
for (float index : indices) {
|
for (float index : indices) {
|
||||||
freqs[pos][out_idx++] = index * (coord * 2.f - 1.f);
|
freqs[pos][out_idx++] = index * (coord * 2.f - 1.f);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (num_heads > 1) {
|
||||||
|
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
|
||||||
|
}
|
||||||
return build_rope_matrix_from_frequencies(freqs, dim);
|
return build_rope_matrix_from_frequencies(freqs, dim);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -705,6 +808,7 @@ namespace LTXV {
|
|||||||
int64_t height,
|
int64_t height,
|
||||||
int64_t frames,
|
int64_t frames,
|
||||||
int dim,
|
int dim,
|
||||||
|
int num_heads,
|
||||||
float frame_rate,
|
float frame_rate,
|
||||||
float theta,
|
float theta,
|
||||||
int max_pos_t,
|
int max_pos_t,
|
||||||
@ -725,7 +829,7 @@ namespace LTXV {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
|
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
__STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
|
__STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
|
||||||
@ -742,6 +846,7 @@ namespace LTXV {
|
|||||||
|
|
||||||
__STATIC_INLINE__ std::vector<float> build_audio_rope_matrix(int64_t seq_len,
|
__STATIC_INLINE__ std::vector<float> build_audio_rope_matrix(int64_t seq_len,
|
||||||
int dim,
|
int dim,
|
||||||
|
int num_heads,
|
||||||
float theta = 10000.f,
|
float theta = 10000.f,
|
||||||
int max_pos_t = 20,
|
int max_pos_t = 20,
|
||||||
bool use_middle_indices_grid = false) {
|
bool use_middle_indices_grid = false) {
|
||||||
@ -755,7 +860,7 @@ namespace LTXV {
|
|||||||
coords[static_cast<size_t>(t)] = start;
|
coords[static_cast<size_t>(t)] = start;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
|
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
struct BasicAVTransformerBlock : public GGMLBlock {
|
struct BasicAVTransformerBlock : public GGMLBlock {
|
||||||
@ -825,7 +930,7 @@ namespace LTXV {
|
|||||||
t = ggml_repeat(ctx->ggml_ctx, t, e);
|
t = ggml_repeat(ctx->ggml_ctx, t, e);
|
||||||
s = ggml_repeat(ctx->ggml_ctx, s, e);
|
s = ggml_repeat(ctx->ggml_ctx, s, e);
|
||||||
auto out = ggml_add(ctx->ggml_ctx, s, t);
|
auto out = ggml_add(ctx->ggml_ctx, s, t);
|
||||||
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
|
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
|
||||||
return std::vector<ggml_tensor*>(chunks.begin() + start, chunks.begin() + start + count);
|
return std::vector<ggml_tensor*>(chunks.begin() + start, chunks.begin() + start + count);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1004,11 +1109,23 @@ namespace LTXV {
|
|||||||
blocks["av_ca_v2a_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 1);
|
blocks["av_ca_v2a_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 1);
|
||||||
|
|
||||||
if (cfg.use_caption_projection) {
|
if (cfg.use_caption_projection) {
|
||||||
|
if (cfg.caption_proj_before_connector) {
|
||||||
|
if (cfg.caption_projection_first_linear) {
|
||||||
|
blocks["caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.hidden_size);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.hidden_size, cfg.hidden_size);
|
blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.hidden_size, cfg.hidden_size);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
if (cfg.use_audio_caption_projection) {
|
if (cfg.use_audio_caption_projection) {
|
||||||
|
if (cfg.caption_proj_before_connector) {
|
||||||
|
if (cfg.caption_projection_first_linear) {
|
||||||
|
blocks["audio_caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.audio_hidden_size);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.audio_hidden_size, cfg.audio_hidden_size);
|
blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.audio_hidden_size, cfg.audio_hidden_size);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (cfg.use_connector) {
|
if (cfg.use_connector) {
|
||||||
blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.connector_hidden_size,
|
blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.connector_hidden_size,
|
||||||
@ -1080,42 +1197,97 @@ namespace LTXV {
|
|||||||
std::pair<ggml_tensor*, ggml_tensor*> preprocess_contexts(GGMLRunnerContext* ctx,
|
std::pair<ggml_tensor*, ggml_tensor*> preprocess_contexts(GGMLRunnerContext* ctx,
|
||||||
ggml_tensor* context,
|
ggml_tensor* context,
|
||||||
ggml_tensor* video_connector_pe,
|
ggml_tensor* video_connector_pe,
|
||||||
ggml_tensor* audio_connector_pe) {
|
ggml_tensor* audio_connector_pe,
|
||||||
|
bool process_audio_context) {
|
||||||
if (context == nullptr) {
|
if (context == nullptr) {
|
||||||
return {nullptr, nullptr};
|
return {nullptr, nullptr};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim) {
|
bool is_fully_processed_context =
|
||||||
return {
|
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
|
||||||
ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim),
|
context->ne[1] >= 1024;
|
||||||
ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim)
|
bool is_unprocessed_dual_context =
|
||||||
};
|
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
|
||||||
|
context->ne[1] < 1024;
|
||||||
|
|
||||||
|
if (is_fully_processed_context) {
|
||||||
|
auto v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
|
||||||
|
ggml_tensor* a_context = nullptr;
|
||||||
|
if (process_audio_context) {
|
||||||
|
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
|
||||||
|
}
|
||||||
|
return {v_context, a_context};
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* v_context = context;
|
ggml_tensor* v_context = context;
|
||||||
ggml_tensor* a_context = context;
|
ggml_tensor* a_context = process_audio_context ? context : nullptr;
|
||||||
if (context->ne[0] == cfg.caption_channels * 2) {
|
if (is_unprocessed_dual_context) {
|
||||||
|
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
|
||||||
|
if (process_audio_context) {
|
||||||
|
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
|
||||||
|
}
|
||||||
|
} else if (context->ne[0] == cfg.caption_channels * 2) {
|
||||||
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.caption_channels);
|
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.caption_channels);
|
||||||
|
if (process_audio_context) {
|
||||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.caption_channels, cfg.caption_channels * 2);
|
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.caption_channels, cfg.caption_channels * 2);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cfg.caption_proj_before_connector) {
|
||||||
|
if (cfg.use_caption_projection &&
|
||||||
|
blocks.count("caption_projection") > 0 &&
|
||||||
|
v_context != nullptr &&
|
||||||
|
v_context->ne[0] == cfg.caption_channels) {
|
||||||
|
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["caption_projection"]);
|
||||||
|
if (caption_projection != nullptr) {
|
||||||
|
v_context = caption_projection->forward(ctx, v_context);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (process_audio_context &&
|
||||||
|
cfg.use_audio_caption_projection &&
|
||||||
|
blocks.count("audio_caption_projection") > 0 &&
|
||||||
|
a_context != nullptr &&
|
||||||
|
a_context->ne[0] == cfg.caption_channels) {
|
||||||
|
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["audio_caption_projection"]);
|
||||||
|
if (caption_projection != nullptr) {
|
||||||
|
a_context = caption_projection->forward(ctx, a_context);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (cfg.use_connector && v_context != nullptr && v_context->ne[0] == cfg.connector_hidden_size) {
|
if (cfg.use_connector && v_context != nullptr && v_context->ne[0] == cfg.connector_hidden_size) {
|
||||||
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["video_embeddings_connector"]);
|
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["video_embeddings_connector"]);
|
||||||
v_context = connector->forward(ctx, v_context, video_connector_pe);
|
v_context = connector->forward(ctx, v_context, video_connector_pe);
|
||||||
}
|
}
|
||||||
if (cfg.use_audio_connector && a_context != nullptr && a_context->ne[0] == cfg.audio_connector_hidden_size) {
|
if (process_audio_context &&
|
||||||
|
cfg.use_audio_connector &&
|
||||||
|
a_context != nullptr &&
|
||||||
|
a_context->ne[0] == cfg.audio_connector_hidden_size) {
|
||||||
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["audio_embeddings_connector"]);
|
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["audio_embeddings_connector"]);
|
||||||
a_context = connector->forward(ctx, a_context, audio_connector_pe);
|
a_context = connector->forward(ctx, a_context, audio_connector_pe);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cfg.use_caption_projection && v_context != nullptr && v_context->ne[0] == cfg.caption_channels) {
|
if (!cfg.caption_proj_before_connector &&
|
||||||
|
cfg.use_caption_projection &&
|
||||||
|
blocks.count("caption_projection") > 0 &&
|
||||||
|
v_context != nullptr &&
|
||||||
|
v_context->ne[0] == cfg.caption_channels) {
|
||||||
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["caption_projection"]);
|
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["caption_projection"]);
|
||||||
|
if (caption_projection != nullptr) {
|
||||||
v_context = caption_projection->forward(ctx, v_context);
|
v_context = caption_projection->forward(ctx, v_context);
|
||||||
}
|
}
|
||||||
if (cfg.use_audio_caption_projection && a_context != nullptr && a_context->ne[0] == cfg.caption_channels) {
|
}
|
||||||
|
if (process_audio_context &&
|
||||||
|
!cfg.caption_proj_before_connector &&
|
||||||
|
cfg.use_audio_caption_projection &&
|
||||||
|
blocks.count("audio_caption_projection") > 0 &&
|
||||||
|
a_context != nullptr &&
|
||||||
|
a_context->ne[0] == cfg.caption_channels) {
|
||||||
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["audio_caption_projection"]);
|
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["audio_caption_projection"]);
|
||||||
|
if (caption_projection != nullptr) {
|
||||||
a_context = caption_projection->forward(ctx, a_context);
|
a_context = caption_projection->forward(ctx, a_context);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return {v_context, a_context};
|
return {v_context, a_context};
|
||||||
}
|
}
|
||||||
@ -1168,9 +1340,13 @@ namespace LTXV {
|
|||||||
ax = nullptr;
|
ax = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto contexts = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe);
|
bool run_ax = ax != nullptr && ggml_nelements(ax) > 0 && audio_time > 0;
|
||||||
|
auto contexts = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe, run_ax);
|
||||||
auto v_context = contexts.first;
|
auto v_context = contexts.first;
|
||||||
auto a_context = contexts.second != nullptr ? contexts.second : contexts.first;
|
auto a_context = contexts.second != nullptr ? contexts.second : contexts.first;
|
||||||
|
if (contexts.second != nullptr) {
|
||||||
|
a_context = ggml_cont(ctx->ggml_ctx, a_context);
|
||||||
|
}
|
||||||
|
|
||||||
auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, cfg.timestep_scale_multiplier);
|
auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, cfg.timestep_scale_multiplier);
|
||||||
auto v_pair = adaln_single->forward(ctx, v_timestep_scaled);
|
auto v_pair = adaln_single->forward(ctx, v_timestep_scaled);
|
||||||
@ -1257,6 +1433,8 @@ namespace LTXV {
|
|||||||
std::vector<float> audio_cross_pe_vec;
|
std::vector<float> audio_cross_pe_vec;
|
||||||
std::vector<float> connector_pe_vec;
|
std::vector<float> connector_pe_vec;
|
||||||
std::vector<float> audio_connector_pe_vec;
|
std::vector<float> audio_connector_pe_vec;
|
||||||
|
sd::Tensor<float> vx_input_cache;
|
||||||
|
sd::Tensor<float> ax_input_cache;
|
||||||
|
|
||||||
static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
|
static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string& bias_name,
|
const std::string& bias_name,
|
||||||
@ -1388,7 +1566,7 @@ namespace LTXV {
|
|||||||
model.get_param_tensors(tensors, prefix);
|
model.get_param_tensors(tensors, prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::pair<sd::Tensor<float>, sd::Tensor<float>> separate_audio_and_video_latents(const sd::Tensor<float>& x_tensor,
|
std::pair<sd::Tensor<float>, sd::Tensor<float>> split_av_latents(const sd::Tensor<float>& x_tensor,
|
||||||
int audio_length) const {
|
int audio_length) const {
|
||||||
if (x_tensor.empty()) {
|
if (x_tensor.empty()) {
|
||||||
return {{}, {}};
|
return {{}, {}};
|
||||||
@ -1424,7 +1602,7 @@ namespace LTXV {
|
|||||||
return {vx, ax};
|
return {vx, ax};
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* recombine_audio_and_video_latents(ggml_context* ctx,
|
ggml_tensor* merge_av_latents(ggml_context* ctx,
|
||||||
ggml_tensor* vx,
|
ggml_tensor* vx,
|
||||||
ggml_tensor* ax) const {
|
ggml_tensor* ax) const {
|
||||||
if (ax == nullptr || ggml_nelements(ax) == 0 || ax->ne[1] == 0) {
|
if (ax == nullptr || ggml_nelements(ax) == 0 || ax->ne[1] == 0) {
|
||||||
@ -1455,12 +1633,16 @@ namespace LTXV {
|
|||||||
const sd::Tensor<float>& audio_x_tensor = {},
|
const sd::Tensor<float>& audio_x_tensor = {},
|
||||||
const sd::Tensor<float>& audio_timesteps_tensor = {},
|
const sd::Tensor<float>& audio_timesteps_tensor = {},
|
||||||
int audio_length = 0) {
|
int audio_length = 0) {
|
||||||
auto split_inputs = separate_audio_and_video_latents(x_tensor, audio_length);
|
auto split_inputs = split_av_latents(x_tensor, audio_length);
|
||||||
const sd::Tensor<float>& vx_tensor = split_inputs.first;
|
vx_input_cache = split_inputs.first;
|
||||||
const sd::Tensor<float>& ax_tensor = !audio_x_tensor.empty() ? audio_x_tensor : split_inputs.second;
|
if (!audio_x_tensor.empty()) {
|
||||||
|
ax_input_cache = audio_x_tensor;
|
||||||
|
} else {
|
||||||
|
ax_input_cache = split_inputs.second;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_tensor* vx = make_input(vx_tensor);
|
ggml_tensor* vx = make_input(vx_input_cache);
|
||||||
ggml_tensor* ax = make_optional_input(ax_tensor);
|
ggml_tensor* ax = make_optional_input(ax_input_cache);
|
||||||
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
||||||
ggml_tensor* a_timestep = make_optional_input(audio_timesteps_tensor);
|
ggml_tensor* a_timestep = make_optional_input(audio_timesteps_tensor);
|
||||||
ggml_tensor* context = make_optional_input(context_tensor);
|
ggml_tensor* context = make_optional_input(context_tensor);
|
||||||
@ -1471,12 +1653,15 @@ namespace LTXV {
|
|||||||
vx->ne[1],
|
vx->ne[1],
|
||||||
vx->ne[2],
|
vx->ne[2],
|
||||||
static_cast<int>(params.hidden_size),
|
static_cast<int>(params.hidden_size),
|
||||||
25.f,
|
static_cast<int>(params.num_attention_heads),
|
||||||
|
24.f,
|
||||||
params.positional_embedding_theta,
|
params.positional_embedding_theta,
|
||||||
params.positional_embedding_max_pos,
|
params.positional_embedding_max_pos,
|
||||||
params.vae_scale_factors,
|
params.vae_scale_factors,
|
||||||
params.causal_temporal_positioning);
|
params.causal_temporal_positioning,
|
||||||
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.hidden_size / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
|
params.use_middle_indices_grid);
|
||||||
|
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads);
|
||||||
|
ggml_set_name(video_pe, "ltxav_video_pe");
|
||||||
set_backend_tensor_data(video_pe, video_pe_vec.data());
|
set_backend_tensor_data(video_pe, video_pe_vec.data());
|
||||||
|
|
||||||
ggml_tensor* audio_pe = nullptr;
|
ggml_tensor* audio_pe = nullptr;
|
||||||
@ -1485,10 +1670,12 @@ namespace LTXV {
|
|||||||
if (ax != nullptr && ggml_nelements(ax) > 0 && ax->ne[1] > 0) {
|
if (ax != nullptr && ggml_nelements(ax) > 0 && ax->ne[1] > 0) {
|
||||||
audio_pe_vec = build_audio_rope_matrix(ax->ne[1],
|
audio_pe_vec = build_audio_rope_matrix(ax->ne[1],
|
||||||
static_cast<int>(params.audio_hidden_size),
|
static_cast<int>(params.audio_hidden_size),
|
||||||
|
static_cast<int>(params.audio_num_attention_heads),
|
||||||
params.positional_embedding_theta,
|
params.positional_embedding_theta,
|
||||||
params.audio_positional_embedding_max_pos[0],
|
params.audio_positional_embedding_max_pos[0],
|
||||||
false);
|
params.use_middle_indices_grid);
|
||||||
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_hidden_size / 2, ax->ne[1]);
|
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
|
||||||
|
ggml_set_name(audio_pe, "ltxav_audio_pe");
|
||||||
set_backend_tensor_data(audio_pe, audio_pe_vec.data());
|
set_backend_tensor_data(audio_pe, audio_pe_vec.data());
|
||||||
|
|
||||||
int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
|
int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
|
||||||
@ -1496,43 +1683,68 @@ namespace LTXV {
|
|||||||
vx->ne[1],
|
vx->ne[1],
|
||||||
vx->ne[2],
|
vx->ne[2],
|
||||||
static_cast<int>(params.audio_cross_attention_dim),
|
static_cast<int>(params.audio_cross_attention_dim),
|
||||||
|
static_cast<int>(params.audio_num_attention_heads),
|
||||||
25.f,
|
25.f,
|
||||||
params.positional_embedding_theta,
|
params.positional_embedding_theta,
|
||||||
temporal_max_pos,
|
temporal_max_pos,
|
||||||
std::get<0>(params.vae_scale_factors),
|
std::get<0>(params.vae_scale_factors),
|
||||||
params.causal_temporal_positioning,
|
params.causal_temporal_positioning,
|
||||||
true);
|
true);
|
||||||
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
|
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads);
|
||||||
|
ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
|
||||||
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
|
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
|
||||||
|
|
||||||
audio_cross_pe_vec = build_audio_rope_matrix(ax->ne[1],
|
audio_cross_pe_vec = build_audio_rope_matrix(ax->ne[1],
|
||||||
static_cast<int>(params.audio_cross_attention_dim),
|
static_cast<int>(params.audio_cross_attention_dim),
|
||||||
|
static_cast<int>(params.audio_num_attention_heads),
|
||||||
params.positional_embedding_theta,
|
params.positional_embedding_theta,
|
||||||
temporal_max_pos,
|
temporal_max_pos,
|
||||||
true);
|
true);
|
||||||
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, ax->ne[1]);
|
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
|
||||||
|
ggml_set_name(audio_cross_pe, "ltxav_audio_cross_pe");
|
||||||
set_backend_tensor_data(audio_cross_pe, audio_cross_pe_vec.data());
|
set_backend_tensor_data(audio_cross_pe, audio_cross_pe_vec.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool needs_video_connector_pe =
|
||||||
|
params.use_connector &&
|
||||||
|
context != nullptr &&
|
||||||
|
(context->ne[0] == params.connector_hidden_size ||
|
||||||
|
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
|
||||||
|
context->ne[0] == params.caption_channels * 2) &&
|
||||||
|
context->ne[1] < 1024));
|
||||||
ggml_tensor* video_connector_pe = nullptr;
|
ggml_tensor* video_connector_pe = nullptr;
|
||||||
if (params.use_connector && context != nullptr && context->ne[0] == params.connector_hidden_size) {
|
if (needs_video_connector_pe) {
|
||||||
int64_t seq_len = context->ne[1];
|
int64_t seq_len = context->ne[1];
|
||||||
int64_t target_len = std::max<int64_t>(1024, seq_len);
|
int64_t target_len = std::max<int64_t>(1024, seq_len);
|
||||||
int64_t duplications = (target_len + params.connector_num_registers - 1) / params.connector_num_registers;
|
int64_t duplications = (target_len + params.connector_num_registers - 1) / params.connector_num_registers;
|
||||||
int64_t full_len = seq_len + duplications * params.connector_num_registers - seq_len;
|
int64_t full_len = seq_len + duplications * params.connector_num_registers - seq_len;
|
||||||
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size));
|
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size), static_cast<int>(params.connector_num_heads), 10000.f, 4096.f, true);
|
||||||
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_hidden_size / 2, full_len);
|
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_head_dim / 2, full_len * params.connector_num_heads);
|
||||||
|
ggml_set_name(video_connector_pe, "ltxav_video_connector_pe");
|
||||||
set_backend_tensor_data(video_connector_pe, connector_pe_vec.data());
|
set_backend_tensor_data(video_connector_pe, connector_pe_vec.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool run_audio_context =
|
||||||
|
ax != nullptr &&
|
||||||
|
ggml_nelements(ax) > 0 &&
|
||||||
|
ax->ne[1] > 0;
|
||||||
|
bool needs_audio_connector_pe =
|
||||||
|
run_audio_context &&
|
||||||
|
params.use_audio_connector &&
|
||||||
|
context != nullptr &&
|
||||||
|
(context->ne[0] == params.audio_connector_hidden_size ||
|
||||||
|
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
|
||||||
|
context->ne[0] == params.caption_channels * 2) &&
|
||||||
|
context->ne[1] < 1024));
|
||||||
ggml_tensor* audio_connector_pe = nullptr;
|
ggml_tensor* audio_connector_pe = nullptr;
|
||||||
if (params.use_audio_connector && context != nullptr && context->ne[0] == params.audio_connector_hidden_size) {
|
if (needs_audio_connector_pe) {
|
||||||
int64_t seq_len = context->ne[1];
|
int64_t seq_len = context->ne[1];
|
||||||
int64_t target_len = std::max<int64_t>(1024, seq_len);
|
int64_t target_len = std::max<int64_t>(1024, seq_len);
|
||||||
int64_t duplications = (target_len + params.audio_connector_num_registers - 1) / params.audio_connector_num_registers;
|
int64_t duplications = (target_len + params.audio_connector_num_registers - 1) / params.audio_connector_num_registers;
|
||||||
int64_t full_len = seq_len + duplications * params.audio_connector_num_registers - seq_len;
|
int64_t full_len = seq_len + duplications * params.audio_connector_num_registers - seq_len;
|
||||||
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size));
|
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size), static_cast<int>(params.audio_connector_num_heads), 10000.f, 4096.f, true);
|
||||||
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_hidden_size / 2, full_len);
|
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_head_dim / 2, full_len * params.audio_connector_num_heads);
|
||||||
|
ggml_set_name(audio_connector_pe, "ltxav_audio_connector_pe");
|
||||||
set_backend_tensor_data(audio_connector_pe, audio_connector_pe_vec.data());
|
set_backend_tensor_data(audio_connector_pe, audio_connector_pe_vec.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1549,7 +1761,7 @@ namespace LTXV {
|
|||||||
audio_cross_pe,
|
audio_cross_pe,
|
||||||
video_connector_pe,
|
video_connector_pe,
|
||||||
audio_connector_pe);
|
audio_connector_pe);
|
||||||
auto out = recombine_audio_and_video_latents(compute_ctx, out_pair.first, out_pair.second);
|
auto out = merge_av_latents(compute_ctx, out_pair.first, out_pair.second);
|
||||||
ggml_build_forward_expand(gf, out);
|
ggml_build_forward_expand(gf, out);
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
@ -1564,7 +1776,106 @@ namespace LTXV {
|
|||||||
auto get_graph = [&]() -> ggml_cgraph* {
|
auto get_graph = [&]() -> ggml_cgraph* {
|
||||||
return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length);
|
return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length);
|
||||||
};
|
};
|
||||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
void test(const std::string& x_path,
|
||||||
|
const std::string& timesteps_path = "",
|
||||||
|
const std::string& context_path = "",
|
||||||
|
const std::string& audio_x_path = "",
|
||||||
|
const std::string& audio_timesteps_path = "") {
|
||||||
|
auto x = sd::load_tensor_from_file_as_tensor<float>(x_path);
|
||||||
|
GGML_ASSERT(!x.empty());
|
||||||
|
print_sd_tensor(x, false, "ltxav_x");
|
||||||
|
|
||||||
|
sd::Tensor<float> timesteps;
|
||||||
|
if (!timesteps_path.empty()) {
|
||||||
|
timesteps = sd::load_tensor_from_file_as_tensor<float>(timesteps_path);
|
||||||
|
} else {
|
||||||
|
timesteps = sd::Tensor<float>::from_vector(std::vector<float>{1.f});
|
||||||
|
}
|
||||||
|
GGML_ASSERT(!timesteps.empty());
|
||||||
|
print_sd_tensor(timesteps, false, "ltxav_timesteps");
|
||||||
|
|
||||||
|
sd::Tensor<float> context;
|
||||||
|
if (!context_path.empty()) {
|
||||||
|
context = sd::load_tensor_from_file_as_tensor<float>(context_path);
|
||||||
|
GGML_ASSERT(!context.empty());
|
||||||
|
print_sd_tensor(context, false, "ltxav_context");
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> audio_x;
|
||||||
|
int audio_length = 0;
|
||||||
|
if (!audio_x_path.empty()) {
|
||||||
|
audio_x = sd::load_tensor_from_file_as_tensor<float>(audio_x_path);
|
||||||
|
GGML_ASSERT(!audio_x.empty());
|
||||||
|
GGML_ASSERT(audio_x.dim() >= 2);
|
||||||
|
audio_length = static_cast<int>(audio_x.shape()[1]);
|
||||||
|
print_sd_tensor(audio_x, false, "ltxav_audio_x");
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> audio_timesteps;
|
||||||
|
if (!audio_timesteps_path.empty()) {
|
||||||
|
audio_timesteps = sd::load_tensor_from_file_as_tensor<float>(audio_timesteps_path);
|
||||||
|
GGML_ASSERT(!audio_timesteps.empty());
|
||||||
|
} else if (!audio_x.empty()) {
|
||||||
|
audio_timesteps = timesteps;
|
||||||
|
}
|
||||||
|
if (!audio_timesteps.empty()) {
|
||||||
|
print_sd_tensor(audio_timesteps, false, "ltxav_audio_timesteps");
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t t0 = ggml_time_ms();
|
||||||
|
auto out_opt = compute(8, x, timesteps, context, audio_x, audio_timesteps, audio_length);
|
||||||
|
int64_t t1 = ggml_time_ms();
|
||||||
|
|
||||||
|
GGML_ASSERT(!out_opt.empty());
|
||||||
|
print_sd_tensor(out_opt, false, "ltxav_out");
|
||||||
|
LOG_DEBUG("ltxav test done in %lldms", t1 - t0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void load_from_file_and_test(const std::string& model_path,
|
||||||
|
const std::string& x_path,
|
||||||
|
const std::string& timesteps_path = "",
|
||||||
|
const std::string& context_path = "",
|
||||||
|
const std::string& embeddings_path = "",
|
||||||
|
const std::string& audio_x_path = "",
|
||||||
|
const std::string& audio_timesteps_path = "") {
|
||||||
|
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||||
|
ggml_backend_t backend = ggml_backend_cpu_init();
|
||||||
|
LOG_INFO("loading ltxav from '%s'", model_path.c_str());
|
||||||
|
|
||||||
|
ModelLoader model_loader;
|
||||||
|
if (!model_loader.init_from_file_and_convert_name(model_path, "model.diffusion_model.")) {
|
||||||
|
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!embeddings_path.empty()) {
|
||||||
|
LOG_INFO("loading ltxav embeddings from '%s'", embeddings_path.c_str());
|
||||||
|
if (!model_loader.init_from_file(embeddings_path)) {
|
||||||
|
LOG_ERROR("init embeddings model loader from file failed: '%s'", embeddings_path.c_str());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
||||||
|
std::shared_ptr<LTXAVRunner> ltxav = std::make_shared<LTXAVRunner>(backend,
|
||||||
|
false,
|
||||||
|
tensor_storage_map,
|
||||||
|
"model.diffusion_model");
|
||||||
|
|
||||||
|
ltxav->alloc_params_buffer();
|
||||||
|
std::map<std::string, ggml_tensor*> tensors;
|
||||||
|
ltxav->get_param_tensors(tensors, "model.diffusion_model");
|
||||||
|
|
||||||
|
if (!model_loader.load_tensors(tensors)) {
|
||||||
|
LOG_ERROR("load tensors from model loader failed");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_INFO("ltxav model loaded");
|
||||||
|
ltxav->test(x_path, timesteps_path, context_path, audio_x_path, audio_timesteps_path);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,7 @@
|
|||||||
#include "diffusion_model.hpp"
|
#include "diffusion_model.hpp"
|
||||||
#include "esrgan.hpp"
|
#include "esrgan.hpp"
|
||||||
#include "lora.hpp"
|
#include "lora.hpp"
|
||||||
#include "ltx_vae.h"
|
#include "ltx_vae.hpp"
|
||||||
#include "pmid.hpp"
|
#include "pmid.hpp"
|
||||||
#include "sample-cache.h"
|
#include "sample-cache.h"
|
||||||
#include "tae.hpp"
|
#include "tae.hpp"
|
||||||
@ -3742,8 +3742,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||||
latents.audio_length = get_ltxav_num_audio_latents(request->frames, request->fps);
|
latents.audio_length = 0;
|
||||||
latents.audio_latent = sd::zeros<float>({16, latents.audio_length, 8, 1});
|
latents.audio_latent = {};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||||
@ -3923,9 +3923,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
|||||||
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
|
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!latents.audio_latent.empty()) {
|
// Pipeline-level audio support is temporarily disabled. Keep the model-side
|
||||||
latents.init_latent = pack_ltxav_audio_and_video_latents(latents.init_latent, latents.audio_latent);
|
// AV implementation intact, but feed pure video latents through vid_gen.
|
||||||
}
|
|
||||||
|
|
||||||
return latents;
|
return latents;
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user