Compare commits

..

1 Commit

Author SHA1 Message Date
leejet
ca7e008d78 wip 2026-04-27 21:43:22 +08:00
10 changed files with 326 additions and 702 deletions

View File

@ -19,6 +19,8 @@
#include "common/media_io.h"
#include "common/resource_owners.hpp"
#include "image_metadata.h"
#include "llm.hpp"
#include "ltx_vae_test.h"
namespace fs = std::filesystem;
@ -500,6 +502,27 @@ int main(int argc, const char* argv[]) {
SDContextParams ctx_params;
SDGenerationParams gen_params;
cli_params.verbose = true;
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
{
const bool run_ltx_vae_test = false;
const std::string model_path = "E:/Code/ComfyUI/models/vae/ltx-2.3-22b-dev_video_vae.safetensors";
const std::string input_path = "E:/Code/sd.cpp/build/ltx_vae_z.bin";
if (run_ltx_vae_test) {
ltx_vae_load_from_file_and_test(model_path, input_path);
return 0;
}
}
// cli_params.verbose = true;
// sd_set_log_callback(sd_log_cb, (void*)&cli_params);
// GemmaTokenizer tokenizer;
// auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
// for (auto token : tokens) {
// LOG_INFO("%d", token);
// }
// return 0;
parse_args(argc, argv, cli_params, ctx_params, gen_params);
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
log_verbose = cli_params.verbose;

View File

@ -103,64 +103,6 @@ namespace DiT {
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
return x;
}
// Split a (possibly batched) video tensor into non-overlapping 3D patches of
// size (pt, ph, pw) and flatten each patch into one token, producing a
// [tokens, features] layout for the DiT transformer. Pure reshape/permute
// graph ops — no arithmetic is performed on the data.
// NOTE: ggml stores dimensions innermost-first, so x->ne = {W, H, T, N*C};
// the bracketed layout comments below are written outermost-first.
inline ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int pt,
int ph,
int pw,
int64_t N = 1) {
// x: [N*C, T, H, W]
// return: [N, t_len*h_len*w_len, C*pt*ph*pw] with t_len = T/pt, h_len = H/ph, w_len = W/pw
int64_t C = x->ne[3] / N;
int64_t T = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t t_len = T / pt;
int64_t h_len = H / ph;
int64_t w_len = W / pw;
// Patch sizes must divide their axes exactly — no padding is applied here.
GGML_ASSERT(C * N == x->ne[3]);
GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);
// Each reshape + permute pair peels one patch axis (pt, then ph, then pw) off
// its spatial axis and moves it toward the feature dimension; the trailing
// comments track the logical layout after each step. The ggml_ext_cont calls
// materialize the permuted views so the next reshape is valid.
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N); // [N*C*t_len, pt, h_len*ph, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, h_len*ph, pt, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N); // [N*C*t_len*h_len, ph, pt, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt, ph, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt*ph, w_len, pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, w_len, pt*ph, pw]
x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N); // [N, C, t_len*h_len*w_len, pt*ph*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N, t_len*h_len*w_len, C, pt*ph*pw]
x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1); // [N, t_len*h_len*w_len, C*pt*ph*pw]
return x;
}
// Reassemble flattened patch tokens back into a video tensor — the structural
// inverse of patchify() above, run in reverse order.
// NOTE(review): the expected per-token feature ordering here is
// [pt*ph*pw, C] (C innermost), whereas patchify emits [C, pt*ph*pw];
// presumably the DiT output projection produces this layout — TODO confirm.
// NOTE(review): batch is read from x->ne[3], but patchify places N in ne[2];
// looks like intermediate DiT reshapes account for this — verify at call site.
inline ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t t_len,
int64_t h_len,
int64_t w_len,
int pt,
int ph,
int pw) {
// x: [N, t_len*h_len*w_len, pt*ph*pw*C]
// return: [N*C, t_len*pt, h_len*ph, w_len*pw]
int64_t N = x->ne[3];
// Channel count is whatever remains after dividing out the patch volume.
int64_t C = x->ne[0] / pt / ph / pw;
GGML_ASSERT(C * pt * ph * pw == x->ne[0]);
// Mirror of patchify: each reshape + permute pair re-inserts one patch axis
// (pw, then ph, then pt) back into its spatial axis; the trailing comments
// track the logical layout after each step (outermost dim first).
x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N); // [N, t_len*h_len*w_len, pt*ph*pw, C]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3)); // [N, C, t_len*h_len*w_len, pt*ph*pw]
x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N); // [N*C*t_len*h_len, w_len, pt*ph, pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt*ph, w_len, pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt, ph, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, ph, pt, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N); // [N*C*t_len, h_len*ph, pt, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, pt, h_len*ph, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N); // [N*C, t_len*pt, h_len*ph, w_len*pw]
return x;
}
} // namespace DiT
#endif // __COMMON_DIT_HPP__

View File

@ -1675,22 +1675,13 @@ struct WeightAdapter {
};
struct GGMLRunnerContext {
ggml_backend_t backend = nullptr;
ggml_context* ggml_ctx = nullptr;
bool flash_attn_enabled = false;
bool conv2d_direct_enabled = false;
bool circular_x_enabled = false;
bool circular_y_enabled = false;
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
void capture_tensor(const std::string& name, ggml_tensor* tensor) {
if (debug_tensors == nullptr || tensor == nullptr) {
return;
}
ggml_set_output(tensor);
(*debug_tensors)[tensor] = name;
}
ggml_backend_t backend = nullptr;
ggml_context* ggml_ctx = nullptr;
bool flash_attn_enabled = false;
bool conv2d_direct_enabled = false;
bool circular_x_enabled = false;
bool circular_y_enabled = false;
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
};
struct GGMLRunner {
@ -1722,7 +1713,6 @@ protected:
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
std::unordered_map<ggml_tensor*, std::string> debug_tensors;
const std::string final_result_name = "ggml_runner_final_result_tensor";
bool flash_attn_enabled = false;
@ -1809,7 +1799,6 @@ protected:
}
void free_compute_ctx() {
debug_tensors.clear();
if (compute_ctx != nullptr) {
ggml_free(compute_ctx);
compute_ctx = nullptr;
@ -1845,11 +1834,6 @@ protected:
auto result = ggml_graph_node(gf, -1);
ggml_set_name(result, final_result_name.c_str());
}
for (const auto& entry : debug_tensors) {
if (entry.first != nullptr) {
ggml_build_forward_expand(gf, entry.first);
}
}
prepare_build_in_tensor_after(gf);
return gf;
}
@ -1919,21 +1903,6 @@ protected:
for (auto& kv : backend_tensor_data_map) {
auto tensor = kv.first;
auto data = kv.second;
if (tensor == nullptr || data == nullptr) {
continue;
}
const char* name = ggml_get_name(tensor);
if (tensor->buffer == nullptr) {
LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
get_desc().c_str(),
name != nullptr ? name : "",
(long long)tensor->ne[0],
(long long)tensor->ne[1],
(long long)tensor->ne[2],
(long long)tensor->ne[3],
ggml_type_name(tensor->type));
continue;
}
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
}
@ -2056,7 +2025,6 @@ public:
runner_ctx.circular_x_enabled = circular_x_enabled;
runner_ctx.circular_y_enabled = circular_y_enabled;
runner_ctx.weight_adapter = weight_adapter;
runner_ctx.debug_tensors = &debug_tensors;
return runner_ctx;
}
@ -2195,21 +2163,6 @@ public:
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
return std::nullopt;
}
for (const auto& entry : debug_tensors) {
auto tensor = entry.first;
if (tensor == nullptr) {
continue;
}
if (tensor->type != GGML_TYPE_F32) {
LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
get_desc().c_str(),
entry.second.c_str(),
ggml_type_name(tensor->type));
continue;
}
auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
print_sd_tensor(debug_tensor, false, entry.second.c_str());
}
copy_cache_tensors_to_cache_buffer();
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
std::optional<sd::Tensor<T>> output;

View File

@ -1,5 +1,5 @@
#ifndef __SD_LTX_VAE_HPP__
#define __SD_LTX_VAE_HPP__
#ifndef __SD_LTX_VAE_H__
#define __SD_LTX_VAE_H__
#include <fstream>
#include <memory>
@ -107,20 +107,20 @@ namespace LTXVAE {
auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);
if (causal) {
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
auto first_frame_pad = first_frame;
for (int i = 1; i < time_kernel_size - 1; i++) {
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
}
x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
} else {
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
auto first_frame_pad = first_frame;
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
}
auto last_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
auto last_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
auto last_frame_pad = last_frame;
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
last_frame_pad = ggml_concat(ctx->ggml_ctx, last_frame_pad, last_frame, 2);
@ -175,7 +175,7 @@ namespace LTXVAE {
public:
ResnetBlock3D(int64_t channels,
float eps = 1e-6f,
float eps = 1e-6f,
bool timestep_conditioning = false)
: channels(channels), timestep_conditioning(timestep_conditioning) {
blocks["norm1"] = std::make_shared<PixelNorm3D>(eps);
@ -333,9 +333,9 @@ namespace LTXVAE {
const int64_t factor = static_cast<int64_t>(factor_t) * static_cast<int64_t>(factor_s) * static_cast<int64_t>(factor_s);
GGML_ASSERT(out_channels % factor == 0);
blocks["conv"] = std::make_shared<CausalConv3d>(in_channels, out_channels / factor, 3);
blocks["skip_downsample"] = std::make_shared<WAN::AvgDown3D>(in_channels, out_channels, factor_t, factor_s);
blocks["conv_downsample"] = std::make_shared<WAN::AvgDown3D>(out_channels / factor, out_channels, factor_t, factor_s);
blocks["conv"] = std::make_shared<CausalConv3d>(in_channels, out_channels / factor, 3);
blocks["skip_downsample"] = std::make_shared<WAN::AvgDown3D>(in_channels, out_channels, factor_t, factor_s);
blocks["conv_downsample"] = std::make_shared<WAN::AvgDown3D>(out_channels / factor, out_channels, factor_t, factor_s);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
@ -530,16 +530,16 @@ namespace LTXVAE {
int64_t latent_channels;
Encoder(int version,
int patch_size = 4,
int64_t in_channels = 3,
int patch_size = 4,
int64_t in_channels = 3,
int64_t latent_channels = 128)
: version(version),
patch_size(patch_size),
in_channels(in_channels),
latent_channels(latent_channels) {
auto cfg = get_encoder_config(version);
int64_t channels = 128;
int64_t in_dim = in_channels * patch_size * patch_size;
auto cfg = get_encoder_config(version);
int64_t channels = 128;
int64_t in_dim = in_channels * patch_size * patch_size;
blocks["conv_in"] = std::make_shared<CausalConv3d>(in_dim, channels, 3);
@ -547,29 +547,29 @@ namespace LTXVAE {
const auto& block = cfg.blocks[block_idx];
if (block.type == "res_x") {
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<UNetMidBlock3D>(channels,
block.num_layers,
false);
block.num_layers,
false);
} else if (block.type == "compress_space_res") {
int64_t next_channels = channels * block.multiplier;
int64_t next_channels = channels * block.multiplier;
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
next_channels,
1,
2);
channels = next_channels;
next_channels,
1,
2);
channels = next_channels;
} else if (block.type == "compress_time_res") {
int64_t next_channels = channels * block.multiplier;
int64_t next_channels = channels * block.multiplier;
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
next_channels,
2,
1);
channels = next_channels;
next_channels,
2,
1);
channels = next_channels;
} else if (block.type == "compress_all_res") {
int64_t next_channels = channels * block.multiplier;
int64_t next_channels = channels * block.multiplier;
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
next_channels,
2,
2);
channels = next_channels;
next_channels,
2,
2);
channels = next_channels;
} else {
GGML_ABORT("Unsupported LTX VAE encoder block");
}
@ -775,7 +775,7 @@ namespace LTXVAE {
auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
auto latents = processor->un_normalize(ctx, z);
auto out = decoder->forward(ctx, latents, timestep);
out = WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
out = WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
return out;
}
@ -936,8 +936,7 @@ struct LTXVideoVAE : public VAE {
static void load_from_file_and_test(const std::string& model_path,
const std::string& input_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_cuda_init(0);
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
ModelLoader model_loader;
@ -968,4 +967,4 @@ struct LTXVideoVAE : public VAE {
}
};
#endif // __SD_LTX_VAE_HPP__
#endif // __SD_LTX_VAE_H__

8
src/ltx_vae_test.cpp Normal file
View File

@ -0,0 +1,8 @@
#include "ltx_vae_test.h"
#include "ltx_vae.h"
/// Out-of-line bridge declared in ltx_vae_test.h: lets callers trigger the
/// LTX VAE load-and-decode smoke test while including only <string>, keeping
/// the full ltx_vae.h header out of their translation units.
/// @param model_path path to the VAE checkpoint (.safetensors) to load
/// @param input_path path to the raw latent tensor dump fed to the test
void ltx_vae_load_from_file_and_test(const std::string& model_path,
                                     const std::string& input_path) {
    // Pure delegation — all of the work happens in the static test helper.
    LTXVideoVAE::load_from_file_and_test(model_path, input_path);
}

9
src/ltx_vae_test.h Normal file
View File

@ -0,0 +1,9 @@
#ifndef __SD_LTX_VAE_TEST_H__
#define __SD_LTX_VAE_TEST_H__
#include <string>
void ltx_vae_load_from_file_and_test(const std::string& model_path,
const std::string& input_path);
#endif // __SD_LTX_VAE_TEST_H__

File diff suppressed because it is too large Load Diff

View File

@ -14,7 +14,7 @@
#include "diffusion_model.hpp"
#include "esrgan.hpp"
#include "lora.hpp"
#include "ltx_vae.hpp"
#include "ltx_vae.h"
#include "pmid.hpp"
#include "sample-cache.h"
#include "tae.hpp"
@ -2966,10 +2966,10 @@ static sd::Tensor<float> pack_ltxav_audio_and_video_latents(const sd::Tensor<flo
/// Number of audio latent frames needed to cover a video clip of the given
/// length. The rendered diff span contained both the pre- and post-change
/// copies of the constexpr block (duplicate declarations that cannot compile);
/// this is the deduplicated post-change function.
/// @param frames number of video frames (must be > 0)
/// @param fps    video frame rate (must be > 0)
/// @return ceil(clip duration in seconds * audio latents per second)
static int get_ltxav_num_audio_latents(int frames, int fps) {
    GGML_ASSERT(frames > 0);
    GGML_ASSERT(fps > 0);
    // 16 kHz audio with a mel hop of 160 samples -> 100 mel frames/second;
    // the audio VAE downsamples time by 4x -> 25 latents/second (exact in float).
    constexpr float kSampleRate = 16000.0f;
    constexpr float kMelHopLength = 160.0f;
    constexpr float kAudioLatentDownsample = 4.0f;
    constexpr float kLatentsPerSecond = kSampleRate / kMelHopLength / kAudioLatentDownsample;
    const float duration_sec = static_cast<float>(frames) / static_cast<float>(fps);
    // Round up so partial latent windows at the clip tail are still covered.
    return static_cast<int>(std::ceil(duration_sec * kLatentsPerSecond));
}
@ -3742,8 +3742,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
}
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
latents.audio_length = 0;
latents.audio_latent = {};
latents.audio_length = get_ltxav_num_audio_latents(request->frames, request->fps);
latents.audio_latent = sd::zeros<float>({16, latents.audio_length, 8, 1});
}
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
@ -3923,8 +3923,9 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
}
// Pipeline-level audio support is temporarily disabled. Keep the model-side
// AV implementation intact, but feed pure video latents through vid_gen.
if (!latents.audio_latent.empty()) {
latents.init_latent = pack_ltxav_audio_and_video_latents(latents.init_latent, latents.audio_latent);
}
return latents;
}

Binary file not shown.