wip

2026-05-08 16:28:53 +00:00 · 2026-04-27 21:43:22 +08:00
10 changed files with 326 additions and 702 deletions
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -19,6 +19,8 @@
 #include "common/media_io.h"
 #include "common/resource_owners.hpp"
 #include "image_metadata.h"
 #include "llm.hpp"
 #include "ltx_vae_test.h"
 namespace fs = std::filesystem;
@ -500,6 +502,27 @@ int main(int argc, const char* argv[]) {
    SDContextParams ctx_params;
    SDGenerationParams gen_params;
    cli_params.verbose = true;
    sd_set_log_callback(sd_log_cb, (void*)&cli_params);
    {
        const bool run_ltx_vae_test  = false;
        const std::string model_path = "E:/Code/ComfyUI/models/vae/ltx-2.3-22b-dev_video_vae.safetensors";
        const std::string input_path = "E:/Code/sd.cpp/build/ltx_vae_z.bin";
        if (run_ltx_vae_test) {
            ltx_vae_load_from_file_and_test(model_path, input_path);
            return 0;
        }
    }
    // cli_params.verbose = true;
    // sd_set_log_callback(sd_log_cb, (void*)&cli_params);
    // GemmaTokenizer tokenizer;
    // auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
    // for (auto token : tokens) {
    //     LOG_INFO("%d", token);
    // }
    // return 0;
    parse_args(argc, argv, cli_params, ctx_params, gen_params);
    sd_set_log_callback(sd_log_cb, (void*)&cli_params);
    log_verbose = cli_params.verbose;
--- a/src/common_dit.hpp
+++ b/src/common_dit.hpp
@ -103,64 +103,6 @@ namespace DiT {
        x         = ggml_ext_slice(ctx, x, 0, 0, W);               // [N, C, H, W]
        return x;
    }
    // Rearranges a video tensor into a sequence of flattened spatio-temporal patches.
    //
    // Shapes in the comments are written outermost-first, i.e. the reverse of ggml's
    // ne[] order: `[N*C, T, H, W]` means ne = {W, H, T, N*C}.
    //
    // ctx        : ggml context the intermediate tensors are created in.
    // x          : input tensor, [N*C, T, H, W].
    // pt, ph, pw : patch size along T, H and W; each must divide its dimension exactly
    //              (checked by GGML_ASSERT).
    // N          : batch size folded into the outermost dimension; x->ne[3] must be a
    //              multiple of N (checked by GGML_ASSERT).
    // returns    : [N, t_len*h_len*w_len, C*pt*ph*pw], where t_len = T/pt, h_len = H/ph
    //              and w_len = W/pw. In ggml order the batch ends up in ne[2] and ne[3] is 1.
    //
    // NOTE(review): N, pt, ph and pw are used as divisors before any check, so a zero
    // value divides by zero rather than tripping an assert.
    inline ggml_tensor* patchify(ggml_context* ctx,
                                 ggml_tensor* x,
                                 int pt,
                                 int ph,
                                 int pw,
                                 int64_t N = 1) {
        // x: [N*C, T, H, W]
        // return: [N, t_len*h_len*w_len, C*pt*ph*pw]
        int64_t C     = x->ne[3] / N;
        int64_t T     = x->ne[2];
        int64_t H     = x->ne[1];
        int64_t W     = x->ne[0];
        int64_t t_len = T / pt;
        int64_t h_len = H / ph;
        int64_t w_len = W / pw;
        GGML_ASSERT(C * N == x->ne[3]);
        GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);
        // Each reshape below re-groups the axes and the following permute (0, 2, 1, 3) swaps
        // the two middle axes; the trailing comment on every line gives the resulting shape.
        // The statement order matters: every step relies on the layout left by the previous one.
        x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N);      // [N*C*t_len, pt, h_len*ph, w_len*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len, h_len*ph, pt, w_len*pw]
        x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N);      // [N*C*t_len*h_len, ph, pt, w_len*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, pt, ph, w_len*pw]
        x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N);      // [N*C*t_len*h_len, pt*ph, w_len, pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, w_len, pt*ph, pw]
        x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N);      // [N, C, t_len*h_len*w_len, pt*ph*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N, t_len*h_len*w_len, C, pt*ph*pw]
        // Final reshape only merges C with the patch axis; no data movement is requested here.
        x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1);  // [N, t_len*h_len*w_len, C*pt*ph*pw]
        return x;
    }
    // Expands a sequence of flattened spatio-temporal patches back into a video tensor.
    //
    // Shapes in the comments are written outermost-first, i.e. the reverse of ggml's
    // ne[] order.
    //
    // ctx                 : ggml context the intermediate tensors are created in.
    // x                   : input tensor, [N, t_len*h_len*w_len, pt*ph*pw*C]; N is read from
    //                       x->ne[3] and C is derived from x->ne[0].
    // t_len, h_len, w_len : number of patches along T, H and W.
    // pt, ph, pw          : patch size along T, H and W; x->ne[0] must be a multiple of
    //                       pt*ph*pw (checked by GGML_ASSERT).
    // returns             : [N*C, t_len*pt, h_len*ph, w_len*pw].
    //
    // NOTE(review): this is not a byte-for-byte inverse of patchify() above. patchify()
    // leaves the batch in ne[2] with C as the outer factor of the last dimension, while this
    // function reads the batch from ne[3] and treats C as the innermost factor. Confirm
    // against the callers which layout is intended.
    // NOTE(review): t_len*h_len*w_len is not checked against x->ne[1] here.
    inline ggml_tensor* unpatchify(ggml_context* ctx,
                                   ggml_tensor* x,
                                   int64_t t_len,
                                   int64_t h_len,
                                   int64_t w_len,
                                   int pt,
                                   int ph,
                                   int pw) {
        // x: [N, t_len*h_len*w_len, pt*ph*pw*C]
        // return: [N*C, t_len*pt, h_len*ph, w_len*pw]
        int64_t N = x->ne[3];
        int64_t C = x->ne[0] / pt / ph / pw;
        GGML_ASSERT(C * pt * ph * pw == x->ne[0]);
        // The first permute (1, 2, 0, 3) moves the channel axis outward; every later permute
        // (0, 2, 1, 3) swaps the two middle axes. The trailing comment on each line gives the
        // resulting shape, and each step relies on the layout left by the previous one.
        x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N);  // [N, t_len*h_len*w_len, pt*ph*pw, C]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));      // [N, C, t_len*h_len*w_len, pt*ph*pw]
        x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N);  // [N*C*t_len*h_len, w_len, pt*ph, pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, pt*ph, w_len, pw]
        x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N);  // [N*C*t_len*h_len, pt, ph, w_len*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, ph, pt, w_len*pw]
        x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N);  // [N*C*t_len, h_len*ph, pt, w_len*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len, pt, h_len*ph, w_len*pw]
        // Final reshape only merges the patch axes into T, H and W.
        x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N);  // [N*C, t_len*pt, h_len*ph, w_len*pw]
        return x;
    }
 }  // namespace DiT
 #endif  // __COMMON_DIT_HPP__
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@ -1675,22 +1675,13 @@ struct WeightAdapter {
 };
 // Bundle of backend/context handles and feature flags. GGMLRunner copies its own
 // settings into one of these (see the `runner_ctx.*` assignments later in this file),
 // and block forward() methods receive it by pointer.
 struct GGMLRunnerContext {
-    ggml_backend_t backend                                       = nullptr;
+    ggml_backend_t backend                        = nullptr;
-    ggml_context* ggml_ctx                                       = nullptr;
+    ggml_context* ggml_ctx                        = nullptr;
-    bool flash_attn_enabled                                      = false;
+    bool flash_attn_enabled                       = false;
-    bool conv2d_direct_enabled                                   = false;
+    bool conv2d_direct_enabled                    = false;
-    bool circular_x_enabled                                      = false;
+    bool circular_x_enabled                       = false;
-    bool circular_y_enabled                                      = false;
+    bool circular_y_enabled                       = false;
-    std::shared_ptr<WeightAdapter> weight_adapter                = nullptr;
+    std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
    // Non-owning pointer to a tensor -> debug-name map; nullptr disables capture_tensor().
    std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
    // Records `tensor` under `name` in the debug map and calls ggml_set_output() on it.
    // Does nothing when no debug map is attached or when `tensor` is null.
    // Capturing the same tensor twice keeps only the most recent name.
    void capture_tensor(const std::string& name, ggml_tensor* tensor) {
        if (debug_tensors == nullptr || tensor == nullptr) {
            return;
        }
        ggml_set_output(tensor);
        (*debug_tensors)[tensor] = name;
    }
 };
 struct GGMLRunner {
@ -1722,7 +1713,6 @@ protected:
    std::map<ggml_tensor*, const void*> backend_tensor_data_map;
    std::map<std::string, ggml_tensor*> cache_tensor_map;  // name -> tensor
    std::unordered_map<ggml_tensor*, std::string> debug_tensors;
    const std::string final_result_name = "ggml_runner_final_result_tensor";
    bool flash_attn_enabled    = false;
@ -1809,7 +1799,6 @@ protected:
    }
    void free_compute_ctx() {
        debug_tensors.clear();
        if (compute_ctx != nullptr) {
            ggml_free(compute_ctx);
            compute_ctx = nullptr;
@ -1845,11 +1834,6 @@ protected:
            auto result = ggml_graph_node(gf, -1);
            ggml_set_name(result, final_result_name.c_str());
        }
        for (const auto& entry : debug_tensors) {
            if (entry.first != nullptr) {
                ggml_build_forward_expand(gf, entry.first);
            }
        }
        prepare_build_in_tensor_after(gf);
        return gf;
    }
@ -1919,21 +1903,6 @@ protected:
        for (auto& kv : backend_tensor_data_map) {
            auto tensor = kv.first;
            auto data   = kv.second;
            if (tensor == nullptr || data == nullptr) {
                continue;
            }
            const char* name = ggml_get_name(tensor);
            if (tensor->buffer == nullptr) {
                LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
                         get_desc().c_str(),
                         name != nullptr ? name : "",
                         (long long)tensor->ne[0],
                         (long long)tensor->ne[1],
                         (long long)tensor->ne[2],
                         (long long)tensor->ne[3],
                         ggml_type_name(tensor->type));
                continue;
            }
            ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
        }
@ -2056,7 +2025,6 @@ public:
        runner_ctx.circular_x_enabled    = circular_x_enabled;
        runner_ctx.circular_y_enabled    = circular_y_enabled;
        runner_ctx.weight_adapter        = weight_adapter;
        runner_ctx.debug_tensors         = &debug_tensors;
        return runner_ctx;
    }
@ -2195,21 +2163,6 @@ public:
            LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
            return std::nullopt;
        }
        for (const auto& entry : debug_tensors) {
            auto tensor = entry.first;
            if (tensor == nullptr) {
                continue;
            }
            if (tensor->type != GGML_TYPE_F32) {
                LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
                         get_desc().c_str(),
                         entry.second.c_str(),
                         ggml_type_name(tensor->type));
                continue;
            }
            auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
            print_sd_tensor(debug_tensor, false, entry.second.c_str());
        }
        copy_cache_tensors_to_cache_buffer();
        auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
        std::optional<sd::Tensor<T>> output;
--- a/src/ltx_vae.hpp
+++ b/src/ltx_vae.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_LTX_VAE_HPP__
+#ifndef __SD_LTX_VAE_H__
-#define __SD_LTX_VAE_HPP__
+#define __SD_LTX_VAE_H__
 #include <fstream>
 #include <memory>
@ -107,20 +107,20 @@ namespace LTXVAE {
            auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);
            if (causal) {
-                auto first_frame     = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
+                auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
                auto first_frame_pad = first_frame;
                for (int i = 1; i < time_kernel_size - 1; i++) {
                    first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
                }
                x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
            } else {
-                auto first_frame     = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
+                auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
                auto first_frame_pad = first_frame;
                for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
                    first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
                }
-                auto last_frame     = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
+                auto last_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
                auto last_frame_pad = last_frame;
                for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
                    last_frame_pad = ggml_concat(ctx->ggml_ctx, last_frame_pad, last_frame, 2);
@ -175,7 +175,7 @@ namespace LTXVAE {
    public:
        ResnetBlock3D(int64_t channels,
-                      float eps                  = 1e-6f,
+                    float eps = 1e-6f,
                      bool timestep_conditioning = false)
            : channels(channels), timestep_conditioning(timestep_conditioning) {
            blocks["norm1"] = std::make_shared<PixelNorm3D>(eps);
@ -333,9 +333,9 @@ namespace LTXVAE {
            const int64_t factor = static_cast<int64_t>(factor_t) * static_cast<int64_t>(factor_s) * static_cast<int64_t>(factor_s);
            GGML_ASSERT(out_channels % factor == 0);
-            blocks["conv"]            = std::make_shared<CausalConv3d>(in_channels, out_channels / factor, 3);
+            blocks["conv"]             = std::make_shared<CausalConv3d>(in_channels, out_channels / factor, 3);
-            blocks["skip_downsample"] = std::make_shared<WAN::AvgDown3D>(in_channels, out_channels, factor_t, factor_s);
+            blocks["skip_downsample"]  = std::make_shared<WAN::AvgDown3D>(in_channels, out_channels, factor_t, factor_s);
-            blocks["conv_downsample"] = std::make_shared<WAN::AvgDown3D>(out_channels / factor, out_channels, factor_t, factor_s);
+            blocks["conv_downsample"]  = std::make_shared<WAN::AvgDown3D>(out_channels / factor, out_channels, factor_t, factor_s);
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx,
@ -530,16 +530,16 @@ namespace LTXVAE {
        int64_t latent_channels;
        Encoder(int version,
-                int patch_size          = 4,
+                int patch_size      = 4,
-                int64_t in_channels     = 3,
+                int64_t in_channels = 3,
                int64_t latent_channels = 128)
            : version(version),
              patch_size(patch_size),
              in_channels(in_channels),
              latent_channels(latent_channels) {
-            auto cfg         = get_encoder_config(version);
+            auto cfg          = get_encoder_config(version);
-            int64_t channels = 128;
+            int64_t channels  = 128;
-            int64_t in_dim   = in_channels * patch_size * patch_size;
+            int64_t in_dim    = in_channels * patch_size * patch_size;
            blocks["conv_in"] = std::make_shared<CausalConv3d>(in_dim, channels, 3);
@ -547,29 +547,29 @@ namespace LTXVAE {
                const auto& block = cfg.blocks[block_idx];
                if (block.type == "res_x") {
                    blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<UNetMidBlock3D>(channels,
-                                                                                                          block.num_layers,
+                                                                                                           block.num_layers,
-                                                                                                          false);
+                                                                                                           false);
                } else if (block.type == "compress_space_res") {
-                    int64_t next_channels                              = channels * block.multiplier;
+                    int64_t next_channels = channels * block.multiplier;
                    blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
-                                                                                                                  next_channels,
+                                                                                                                   next_channels,
-                                                                                                                  1,
+                                                                                                                   1,
-                                                                                                                  2);
+                                                                                                                   2);
-                    channels                                           = next_channels;
+                    channels = next_channels;
                } else if (block.type == "compress_time_res") {
-                    int64_t next_channels                              = channels * block.multiplier;
+                    int64_t next_channels = channels * block.multiplier;
                    blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
-                                                                                                                  next_channels,
+                                                                                                                   next_channels,
-                                                                                                                  2,
+                                                                                                                   2,
-                                                                                                                  1);
+                                                                                                                   1);
-                    channels                                           = next_channels;
+                    channels = next_channels;
                } else if (block.type == "compress_all_res") {
-                    int64_t next_channels                              = channels * block.multiplier;
+                    int64_t next_channels = channels * block.multiplier;
                    blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
-                                                                                                                  next_channels,
+                                                                                                                   next_channels,
-                                                                                                                  2,
+                                                                                                                   2,
-                                                                                                                  2);
+                                                                                                                   2);
-                    channels                                           = next_channels;
+                    channels = next_channels;
                } else {
                    GGML_ABORT("Unsupported LTX VAE encoder block");
                }
@ -775,7 +775,7 @@ namespace LTXVAE {
            auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
            auto latents   = processor->un_normalize(ctx, z);
            auto out       = decoder->forward(ctx, latents, timestep);
-            out            = WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
+            out          = WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
            return out;
        }
@ -936,8 +936,7 @@ struct LTXVideoVAE : public VAE {
    static void load_from_file_and_test(const std::string& model_path,
                                        const std::string& input_path) {
-        // ggml_backend_t backend = ggml_backend_cuda_init(0);
+        ggml_backend_t backend = ggml_backend_cuda_init(0);
        ggml_backend_t backend = ggml_backend_cpu_init();
        LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
        ModelLoader model_loader;
@ -968,4 +967,4 @@ struct LTXVideoVAE : public VAE {
    }
 };
-#endif  // __SD_LTX_VAE_HPP__
+#endif  // __SD_LTX_VAE_H__
--- a/src/ltx_vae_test.cpp
+++ b/src/ltx_vae_test.cpp
@ -0,0 +1,8 @@
 #include "ltx_vae_test.h"
 #include "ltx_vae.h"
 // Free-function entry point for the LTX VAE self-test: hands both paths straight to
 // LTXVideoVAE::load_from_file_and_test and adds no logic of its own.
 void ltx_vae_load_from_file_and_test(const std::string& model_path, const std::string& input_path) {
    return LTXVideoVAE::load_from_file_and_test(model_path, input_path);
 }
--- a/src/ltx_vae_test.h
+++ b/src/ltx_vae_test.h
@ -0,0 +1,9 @@
 #ifndef __SD_LTX_VAE_TEST_H__
 #define __SD_LTX_VAE_TEST_H__
 #include <string>
 // Runs the LTX video VAE load-and-test routine (the definition forwards to
 // LTXVideoVAE::load_from_file_and_test).
 //
 // model_path : path of the VAE weights file to load.
 // input_path : path of the test input file; presumably a binary latent dump -
 //              TODO confirm against LTXVideoVAE::load_from_file_and_test.
 void ltx_vae_load_from_file_and_test(const std::string& model_path,
                                      const std::string& input_path);
 #endif  // __SD_LTX_VAE_TEST_H__
--- a/src/ltxv.hpp
+++ b/src/ltxv.hpp
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -14,7 +14,7 @@
 #include "diffusion_model.hpp"
 #include "esrgan.hpp"
 #include "lora.hpp"
-#include "ltx_vae.hpp"
+#include "ltx_vae.h"
 #include "pmid.hpp"
 #include "sample-cache.h"
 #include "tae.hpp"
@ -2966,10 +2966,10 @@ static sd::Tensor<float> pack_ltxav_audio_and_video_latents(const sd::Tensor<flo
 // Returns how many audio latents cover a clip of `frames` video frames played at `fps`:
 // ceil(frames / fps * kLatentsPerSecond), where kLatentsPerSecond is
 // kSampleRate / kMelHopLength / kAudioLatentDownsample = 16000 / 160 / 4 = 25.
 // Both arguments must be positive (checked by GGML_ASSERT).
 static int get_ltxav_num_audio_latents(int frames, int fps) {
    GGML_ASSERT(frames > 0);
    GGML_ASSERT(fps > 0);
-    constexpr float kSampleRate            = 16000.0f;
+    constexpr float kSampleRate                 = 16000.0f;
-    constexpr float kMelHopLength          = 160.0f;
+    constexpr float kMelHopLength               = 160.0f;
-    constexpr float kAudioLatentDownsample = 4.0f;
+    constexpr float kAudioLatentDownsample      = 4.0f;
-    constexpr float kLatentsPerSecond      = kSampleRate / kMelHopLength / kAudioLatentDownsample;
+    constexpr float kLatentsPerSecond           = kSampleRate / kMelHopLength / kAudioLatentDownsample;
    // NOTE(review): the duration is computed in single-precision float before ceil, so a
    // product that lands marginally above a whole number rounds up by one - confirm this
    // matches the reference implementation.
    return static_cast<int>(std::ceil((static_cast<float>(frames) / static_cast<float>(fps)) * kLatentsPerSecond));
 }
@ -3742,8 +3742,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
    }
    if (sd_version_is_ltxav(sd_ctx->sd->version)) {
-        latents.audio_length = 0;
+        latents.audio_length = get_ltxav_num_audio_latents(request->frames, request->fps);
-        latents.audio_latent = {};
+        latents.audio_latent = sd::zeros<float>({16, latents.audio_length, 8, 1});
    }
    if (sd_version_is_ltxav(sd_ctx->sd->version)) {
@ -3923,8 +3923,9 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
        latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
    }
-    // Pipeline-level audio support is temporarily disabled. Keep the model-side
+    if (!latents.audio_latent.empty()) {
-    // AV implementation intact, but feed pure video latents through vid_gen.
+        latents.init_latent = pack_ltxav_audio_and_video_latents(latents.init_latent, latents.audio_latent);
    }
    return latents;
 }
--- a/src/tokenizers/vocab/gemma_merges.hpp
+++ b/src/tokenizers/vocab/gemma_merges.hpp
--- a/src/tokenizers/vocab/gemma_vocab.hpp
+++ b/src/tokenizers/vocab/gemma_vocab.hpp