mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-05-08 16:28:53 +00:00
Compare commits
1 Commits
2ca782a65a
...
ca7e008d78
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ca7e008d78 |
@ -19,6 +19,8 @@
|
|||||||
#include "common/media_io.h"
|
#include "common/media_io.h"
|
||||||
#include "common/resource_owners.hpp"
|
#include "common/resource_owners.hpp"
|
||||||
#include "image_metadata.h"
|
#include "image_metadata.h"
|
||||||
|
#include "llm.hpp"
|
||||||
|
#include "ltx_vae_test.h"
|
||||||
|
|
||||||
namespace fs = std::filesystem;
|
namespace fs = std::filesystem;
|
||||||
|
|
||||||
@ -500,6 +502,27 @@ int main(int argc, const char* argv[]) {
|
|||||||
SDContextParams ctx_params;
|
SDContextParams ctx_params;
|
||||||
SDGenerationParams gen_params;
|
SDGenerationParams gen_params;
|
||||||
|
|
||||||
|
cli_params.verbose = true;
|
||||||
|
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
||||||
|
{
|
||||||
|
const bool run_ltx_vae_test = false;
|
||||||
|
const std::string model_path = "E:/Code/ComfyUI/models/vae/ltx-2.3-22b-dev_video_vae.safetensors";
|
||||||
|
const std::string input_path = "E:/Code/sd.cpp/build/ltx_vae_z.bin";
|
||||||
|
if (run_ltx_vae_test) {
|
||||||
|
ltx_vae_load_from_file_and_test(model_path, input_path);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cli_params.verbose = true;
|
||||||
|
// sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
||||||
|
// GemmaTokenizer tokenizer;
|
||||||
|
// auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
|
||||||
|
// for (auto token : tokens) {
|
||||||
|
// LOG_INFO("%d", token);
|
||||||
|
// }
|
||||||
|
// return 0;
|
||||||
|
|
||||||
parse_args(argc, argv, cli_params, ctx_params, gen_params);
|
parse_args(argc, argv, cli_params, ctx_params, gen_params);
|
||||||
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
|
||||||
log_verbose = cli_params.verbose;
|
log_verbose = cli_params.verbose;
|
||||||
|
|||||||
@ -103,64 +103,6 @@ namespace DiT {
|
|||||||
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
|
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline ggml_tensor* patchify(ggml_context* ctx,
|
|
||||||
ggml_tensor* x,
|
|
||||||
int pt,
|
|
||||||
int ph,
|
|
||||||
int pw,
|
|
||||||
int64_t N = 1) {
|
|
||||||
// x: [N*C, T, H, W]
|
|
||||||
// return: [N, h*w, C*pt*ph*pw]
|
|
||||||
int64_t C = x->ne[3] / N;
|
|
||||||
int64_t T = x->ne[2];
|
|
||||||
int64_t H = x->ne[1];
|
|
||||||
int64_t W = x->ne[0];
|
|
||||||
int64_t t_len = T / pt;
|
|
||||||
int64_t h_len = H / ph;
|
|
||||||
int64_t w_len = W / pw;
|
|
||||||
|
|
||||||
GGML_ASSERT(C * N == x->ne[3]);
|
|
||||||
GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);
|
|
||||||
|
|
||||||
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N); // [N*C*t_len, pt, h_len*ph, w_len*pw]
|
|
||||||
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, h_len*ph, pt, w_len*pw]
|
|
||||||
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N); // [N*C*t_len*h_len, ph, pt, w_len*pw]
|
|
||||||
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt, ph, w_len*pw]
|
|
||||||
x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt*ph, w_len, pw]
|
|
||||||
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, w_len, pt*ph, pw]
|
|
||||||
x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N); // [N, C, t_len*h_len*w_len, pt*ph*pw]
|
|
||||||
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N, t_len*h_len*w_len, C, pt*ph*pw]
|
|
||||||
x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1); // [N, t_len*h_len*w_len, C*pt*ph*pw]
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline ggml_tensor* unpatchify(ggml_context* ctx,
|
|
||||||
ggml_tensor* x,
|
|
||||||
int64_t t_len,
|
|
||||||
int64_t h_len,
|
|
||||||
int64_t w_len,
|
|
||||||
int pt,
|
|
||||||
int ph,
|
|
||||||
int pw) {
|
|
||||||
// x: [N, t_len*h_len*w_len, pt*ph*pw*C]
|
|
||||||
// return: [N*C, t_len*pt, h_len*ph, w_len*pw]
|
|
||||||
int64_t N = x->ne[3];
|
|
||||||
int64_t C = x->ne[0] / pt / ph / pw;
|
|
||||||
|
|
||||||
GGML_ASSERT(C * pt * ph * pw == x->ne[0]);
|
|
||||||
|
|
||||||
x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N); // [N, t_len*h_len*w_len, pt*ph*pw, C]
|
|
||||||
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3)); // [N, C, t_len*h_len*w_len, pt*ph*pw]
|
|
||||||
x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N); // [N*C*t_len*h_len, w_len, pt*ph, pw]
|
|
||||||
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt*ph, w_len, pw]
|
|
||||||
x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt, ph, w_len*pw]
|
|
||||||
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, ph, pt, w_len*pw]
|
|
||||||
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N); // [N*C*t_len, h_len*ph, pt, w_len*pw]
|
|
||||||
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, pt, h_len*ph, w_len*pw]
|
|
||||||
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N); // [N*C, t_len*pt, h_len*ph, w_len*pw]
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
} // namespace DiT
|
} // namespace DiT
|
||||||
|
|
||||||
#endif // __COMMON_DIT_HPP__
|
#endif // __COMMON_DIT_HPP__
|
||||||
|
|||||||
@ -1675,22 +1675,13 @@ struct WeightAdapter {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct GGMLRunnerContext {
|
struct GGMLRunnerContext {
|
||||||
ggml_backend_t backend = nullptr;
|
ggml_backend_t backend = nullptr;
|
||||||
ggml_context* ggml_ctx = nullptr;
|
ggml_context* ggml_ctx = nullptr;
|
||||||
bool flash_attn_enabled = false;
|
bool flash_attn_enabled = false;
|
||||||
bool conv2d_direct_enabled = false;
|
bool conv2d_direct_enabled = false;
|
||||||
bool circular_x_enabled = false;
|
bool circular_x_enabled = false;
|
||||||
bool circular_y_enabled = false;
|
bool circular_y_enabled = false;
|
||||||
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
||||||
std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
|
|
||||||
|
|
||||||
void capture_tensor(const std::string& name, ggml_tensor* tensor) {
|
|
||||||
if (debug_tensors == nullptr || tensor == nullptr) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
ggml_set_output(tensor);
|
|
||||||
(*debug_tensors)[tensor] = name;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct GGMLRunner {
|
struct GGMLRunner {
|
||||||
@ -1722,7 +1713,6 @@ protected:
|
|||||||
|
|
||||||
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
|
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
|
||||||
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
|
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
|
||||||
std::unordered_map<ggml_tensor*, std::string> debug_tensors;
|
|
||||||
const std::string final_result_name = "ggml_runner_final_result_tensor";
|
const std::string final_result_name = "ggml_runner_final_result_tensor";
|
||||||
|
|
||||||
bool flash_attn_enabled = false;
|
bool flash_attn_enabled = false;
|
||||||
@ -1809,7 +1799,6 @@ protected:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void free_compute_ctx() {
|
void free_compute_ctx() {
|
||||||
debug_tensors.clear();
|
|
||||||
if (compute_ctx != nullptr) {
|
if (compute_ctx != nullptr) {
|
||||||
ggml_free(compute_ctx);
|
ggml_free(compute_ctx);
|
||||||
compute_ctx = nullptr;
|
compute_ctx = nullptr;
|
||||||
@ -1845,11 +1834,6 @@ protected:
|
|||||||
auto result = ggml_graph_node(gf, -1);
|
auto result = ggml_graph_node(gf, -1);
|
||||||
ggml_set_name(result, final_result_name.c_str());
|
ggml_set_name(result, final_result_name.c_str());
|
||||||
}
|
}
|
||||||
for (const auto& entry : debug_tensors) {
|
|
||||||
if (entry.first != nullptr) {
|
|
||||||
ggml_build_forward_expand(gf, entry.first);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
prepare_build_in_tensor_after(gf);
|
prepare_build_in_tensor_after(gf);
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
@ -1919,21 +1903,6 @@ protected:
|
|||||||
for (auto& kv : backend_tensor_data_map) {
|
for (auto& kv : backend_tensor_data_map) {
|
||||||
auto tensor = kv.first;
|
auto tensor = kv.first;
|
||||||
auto data = kv.second;
|
auto data = kv.second;
|
||||||
if (tensor == nullptr || data == nullptr) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const char* name = ggml_get_name(tensor);
|
|
||||||
if (tensor->buffer == nullptr) {
|
|
||||||
LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
|
|
||||||
get_desc().c_str(),
|
|
||||||
name != nullptr ? name : "",
|
|
||||||
(long long)tensor->ne[0],
|
|
||||||
(long long)tensor->ne[1],
|
|
||||||
(long long)tensor->ne[2],
|
|
||||||
(long long)tensor->ne[3],
|
|
||||||
ggml_type_name(tensor->type));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
|
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
|
||||||
}
|
}
|
||||||
@ -2056,7 +2025,6 @@ public:
|
|||||||
runner_ctx.circular_x_enabled = circular_x_enabled;
|
runner_ctx.circular_x_enabled = circular_x_enabled;
|
||||||
runner_ctx.circular_y_enabled = circular_y_enabled;
|
runner_ctx.circular_y_enabled = circular_y_enabled;
|
||||||
runner_ctx.weight_adapter = weight_adapter;
|
runner_ctx.weight_adapter = weight_adapter;
|
||||||
runner_ctx.debug_tensors = &debug_tensors;
|
|
||||||
return runner_ctx;
|
return runner_ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2195,21 +2163,6 @@ public:
|
|||||||
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
|
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
|
||||||
return std::nullopt;
|
return std::nullopt;
|
||||||
}
|
}
|
||||||
for (const auto& entry : debug_tensors) {
|
|
||||||
auto tensor = entry.first;
|
|
||||||
if (tensor == nullptr) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (tensor->type != GGML_TYPE_F32) {
|
|
||||||
LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
|
|
||||||
get_desc().c_str(),
|
|
||||||
entry.second.c_str(),
|
|
||||||
ggml_type_name(tensor->type));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
|
|
||||||
print_sd_tensor(debug_tensor, false, entry.second.c_str());
|
|
||||||
}
|
|
||||||
copy_cache_tensors_to_cache_buffer();
|
copy_cache_tensors_to_cache_buffer();
|
||||||
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
|
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
|
||||||
std::optional<sd::Tensor<T>> output;
|
std::optional<sd::Tensor<T>> output;
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef __SD_LTX_VAE_HPP__
|
#ifndef __SD_LTX_VAE_H__
|
||||||
#define __SD_LTX_VAE_HPP__
|
#define __SD_LTX_VAE_H__
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
@ -107,20 +107,20 @@ namespace LTXVAE {
|
|||||||
auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);
|
auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);
|
||||||
|
|
||||||
if (causal) {
|
if (causal) {
|
||||||
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
|
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
|
||||||
auto first_frame_pad = first_frame;
|
auto first_frame_pad = first_frame;
|
||||||
for (int i = 1; i < time_kernel_size - 1; i++) {
|
for (int i = 1; i < time_kernel_size - 1; i++) {
|
||||||
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
|
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
|
||||||
}
|
}
|
||||||
x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
|
x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
|
||||||
} else {
|
} else {
|
||||||
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
|
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
|
||||||
auto first_frame_pad = first_frame;
|
auto first_frame_pad = first_frame;
|
||||||
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
|
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
|
||||||
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
|
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto last_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
|
auto last_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
|
||||||
auto last_frame_pad = last_frame;
|
auto last_frame_pad = last_frame;
|
||||||
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
|
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
|
||||||
last_frame_pad = ggml_concat(ctx->ggml_ctx, last_frame_pad, last_frame, 2);
|
last_frame_pad = ggml_concat(ctx->ggml_ctx, last_frame_pad, last_frame, 2);
|
||||||
@ -175,7 +175,7 @@ namespace LTXVAE {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
ResnetBlock3D(int64_t channels,
|
ResnetBlock3D(int64_t channels,
|
||||||
float eps = 1e-6f,
|
float eps = 1e-6f,
|
||||||
bool timestep_conditioning = false)
|
bool timestep_conditioning = false)
|
||||||
: channels(channels), timestep_conditioning(timestep_conditioning) {
|
: channels(channels), timestep_conditioning(timestep_conditioning) {
|
||||||
blocks["norm1"] = std::make_shared<PixelNorm3D>(eps);
|
blocks["norm1"] = std::make_shared<PixelNorm3D>(eps);
|
||||||
@ -333,9 +333,9 @@ namespace LTXVAE {
|
|||||||
const int64_t factor = static_cast<int64_t>(factor_t) * static_cast<int64_t>(factor_s) * static_cast<int64_t>(factor_s);
|
const int64_t factor = static_cast<int64_t>(factor_t) * static_cast<int64_t>(factor_s) * static_cast<int64_t>(factor_s);
|
||||||
GGML_ASSERT(out_channels % factor == 0);
|
GGML_ASSERT(out_channels % factor == 0);
|
||||||
|
|
||||||
blocks["conv"] = std::make_shared<CausalConv3d>(in_channels, out_channels / factor, 3);
|
blocks["conv"] = std::make_shared<CausalConv3d>(in_channels, out_channels / factor, 3);
|
||||||
blocks["skip_downsample"] = std::make_shared<WAN::AvgDown3D>(in_channels, out_channels, factor_t, factor_s);
|
blocks["skip_downsample"] = std::make_shared<WAN::AvgDown3D>(in_channels, out_channels, factor_t, factor_s);
|
||||||
blocks["conv_downsample"] = std::make_shared<WAN::AvgDown3D>(out_channels / factor, out_channels, factor_t, factor_s);
|
blocks["conv_downsample"] = std::make_shared<WAN::AvgDown3D>(out_channels / factor, out_channels, factor_t, factor_s);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
@ -530,16 +530,16 @@ namespace LTXVAE {
|
|||||||
int64_t latent_channels;
|
int64_t latent_channels;
|
||||||
|
|
||||||
Encoder(int version,
|
Encoder(int version,
|
||||||
int patch_size = 4,
|
int patch_size = 4,
|
||||||
int64_t in_channels = 3,
|
int64_t in_channels = 3,
|
||||||
int64_t latent_channels = 128)
|
int64_t latent_channels = 128)
|
||||||
: version(version),
|
: version(version),
|
||||||
patch_size(patch_size),
|
patch_size(patch_size),
|
||||||
in_channels(in_channels),
|
in_channels(in_channels),
|
||||||
latent_channels(latent_channels) {
|
latent_channels(latent_channels) {
|
||||||
auto cfg = get_encoder_config(version);
|
auto cfg = get_encoder_config(version);
|
||||||
int64_t channels = 128;
|
int64_t channels = 128;
|
||||||
int64_t in_dim = in_channels * patch_size * patch_size;
|
int64_t in_dim = in_channels * patch_size * patch_size;
|
||||||
|
|
||||||
blocks["conv_in"] = std::make_shared<CausalConv3d>(in_dim, channels, 3);
|
blocks["conv_in"] = std::make_shared<CausalConv3d>(in_dim, channels, 3);
|
||||||
|
|
||||||
@ -547,29 +547,29 @@ namespace LTXVAE {
|
|||||||
const auto& block = cfg.blocks[block_idx];
|
const auto& block = cfg.blocks[block_idx];
|
||||||
if (block.type == "res_x") {
|
if (block.type == "res_x") {
|
||||||
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<UNetMidBlock3D>(channels,
|
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<UNetMidBlock3D>(channels,
|
||||||
block.num_layers,
|
block.num_layers,
|
||||||
false);
|
false);
|
||||||
} else if (block.type == "compress_space_res") {
|
} else if (block.type == "compress_space_res") {
|
||||||
int64_t next_channels = channels * block.multiplier;
|
int64_t next_channels = channels * block.multiplier;
|
||||||
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
|
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
|
||||||
next_channels,
|
next_channels,
|
||||||
1,
|
1,
|
||||||
2);
|
2);
|
||||||
channels = next_channels;
|
channels = next_channels;
|
||||||
} else if (block.type == "compress_time_res") {
|
} else if (block.type == "compress_time_res") {
|
||||||
int64_t next_channels = channels * block.multiplier;
|
int64_t next_channels = channels * block.multiplier;
|
||||||
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
|
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
|
||||||
next_channels,
|
next_channels,
|
||||||
2,
|
2,
|
||||||
1);
|
1);
|
||||||
channels = next_channels;
|
channels = next_channels;
|
||||||
} else if (block.type == "compress_all_res") {
|
} else if (block.type == "compress_all_res") {
|
||||||
int64_t next_channels = channels * block.multiplier;
|
int64_t next_channels = channels * block.multiplier;
|
||||||
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
|
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
|
||||||
next_channels,
|
next_channels,
|
||||||
2,
|
2,
|
||||||
2);
|
2);
|
||||||
channels = next_channels;
|
channels = next_channels;
|
||||||
} else {
|
} else {
|
||||||
GGML_ABORT("Unsupported LTX VAE encoder block");
|
GGML_ABORT("Unsupported LTX VAE encoder block");
|
||||||
}
|
}
|
||||||
@ -775,7 +775,7 @@ namespace LTXVAE {
|
|||||||
auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
|
auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
|
||||||
auto latents = processor->un_normalize(ctx, z);
|
auto latents = processor->un_normalize(ctx, z);
|
||||||
auto out = decoder->forward(ctx, latents, timestep);
|
auto out = decoder->forward(ctx, latents, timestep);
|
||||||
out = WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
|
out = WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -936,8 +936,7 @@ struct LTXVideoVAE : public VAE {
|
|||||||
|
|
||||||
static void load_from_file_and_test(const std::string& model_path,
|
static void load_from_file_and_test(const std::string& model_path,
|
||||||
const std::string& input_path) {
|
const std::string& input_path) {
|
||||||
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||||
ggml_backend_t backend = ggml_backend_cpu_init();
|
|
||||||
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
|
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
|
||||||
|
|
||||||
ModelLoader model_loader;
|
ModelLoader model_loader;
|
||||||
@ -968,4 +967,4 @@ struct LTXVideoVAE : public VAE {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // __SD_LTX_VAE_HPP__
|
#endif // __SD_LTX_VAE_H__
|
||||||
8
src/ltx_vae_test.cpp
Normal file
8
src/ltx_vae_test.cpp
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
#include "ltx_vae_test.h"
|
||||||
|
|
||||||
|
#include "ltx_vae.h"
|
||||||
|
|
||||||
|
void ltx_vae_load_from_file_and_test(const std::string& model_path,
|
||||||
|
const std::string& input_path) {
|
||||||
|
LTXVideoVAE::load_from_file_and_test(model_path, input_path);
|
||||||
|
}
|
||||||
9
src/ltx_vae_test.h
Normal file
9
src/ltx_vae_test.h
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#ifndef __SD_LTX_VAE_TEST_H__
|
||||||
|
#define __SD_LTX_VAE_TEST_H__
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
void ltx_vae_load_from_file_and_test(const std::string& model_path,
|
||||||
|
const std::string& input_path);
|
||||||
|
|
||||||
|
#endif // __SD_LTX_VAE_TEST_H__
|
||||||
781
src/ltxv.hpp
781
src/ltxv.hpp
File diff suppressed because it is too large
Load Diff
@ -14,7 +14,7 @@
|
|||||||
#include "diffusion_model.hpp"
|
#include "diffusion_model.hpp"
|
||||||
#include "esrgan.hpp"
|
#include "esrgan.hpp"
|
||||||
#include "lora.hpp"
|
#include "lora.hpp"
|
||||||
#include "ltx_vae.hpp"
|
#include "ltx_vae.h"
|
||||||
#include "pmid.hpp"
|
#include "pmid.hpp"
|
||||||
#include "sample-cache.h"
|
#include "sample-cache.h"
|
||||||
#include "tae.hpp"
|
#include "tae.hpp"
|
||||||
@ -2966,10 +2966,10 @@ static sd::Tensor<float> pack_ltxav_audio_and_video_latents(const sd::Tensor<flo
|
|||||||
static int get_ltxav_num_audio_latents(int frames, int fps) {
|
static int get_ltxav_num_audio_latents(int frames, int fps) {
|
||||||
GGML_ASSERT(frames > 0);
|
GGML_ASSERT(frames > 0);
|
||||||
GGML_ASSERT(fps > 0);
|
GGML_ASSERT(fps > 0);
|
||||||
constexpr float kSampleRate = 16000.0f;
|
constexpr float kSampleRate = 16000.0f;
|
||||||
constexpr float kMelHopLength = 160.0f;
|
constexpr float kMelHopLength = 160.0f;
|
||||||
constexpr float kAudioLatentDownsample = 4.0f;
|
constexpr float kAudioLatentDownsample = 4.0f;
|
||||||
constexpr float kLatentsPerSecond = kSampleRate / kMelHopLength / kAudioLatentDownsample;
|
constexpr float kLatentsPerSecond = kSampleRate / kMelHopLength / kAudioLatentDownsample;
|
||||||
return static_cast<int>(std::ceil((static_cast<float>(frames) / static_cast<float>(fps)) * kLatentsPerSecond));
|
return static_cast<int>(std::ceil((static_cast<float>(frames) / static_cast<float>(fps)) * kLatentsPerSecond));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3742,8 +3742,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||||
latents.audio_length = 0;
|
latents.audio_length = get_ltxav_num_audio_latents(request->frames, request->fps);
|
||||||
latents.audio_latent = {};
|
latents.audio_latent = sd::zeros<float>({16, latents.audio_length, 8, 1});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||||
@ -3923,8 +3923,9 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
|||||||
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
|
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pipeline-level audio support is temporarily disabled. Keep the model-side
|
if (!latents.audio_latent.empty()) {
|
||||||
// AV implementation intact, but feed pure video latents through vid_gen.
|
latents.init_latent = pack_ltxav_audio_and_video_latents(latents.init_latent, latents.audio_latent);
|
||||||
|
}
|
||||||
|
|
||||||
return latents;
|
return latents;
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user