Compare commits


1 Commit

Author  SHA1        Message  Date
leejet  ca7e008d78  wip      2026-04-27 21:43:22 +08:00
10 changed files with 326 additions and 702 deletions

View File

@@ -19,6 +19,8 @@
#include "common/media_io.h"
#include "common/resource_owners.hpp"
#include "image_metadata.h"
#include "llm.hpp"
#include "ltx_vae_test.h"
namespace fs = std::filesystem;
@@ -500,6 +502,27 @@ int main(int argc, const char* argv[]) {
SDContextParams ctx_params;
SDGenerationParams gen_params;
cli_params.verbose = true;
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
{
const bool run_ltx_vae_test = false;
const std::string model_path = "E:/Code/ComfyUI/models/vae/ltx-2.3-22b-dev_video_vae.safetensors";
const std::string input_path = "E:/Code/sd.cpp/build/ltx_vae_z.bin";
if (run_ltx_vae_test) {
ltx_vae_load_from_file_and_test(model_path, input_path);
return 0;
}
}
// cli_params.verbose = true;
// sd_set_log_callback(sd_log_cb, (void*)&cli_params);
// GemmaTokenizer tokenizer;
// auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");  // "a cute little kitten"
// for (auto token : tokens) {
// LOG_INFO("%d", token);
// }
// return 0;
parse_args(argc, argv, cli_params, ctx_params, gen_params);
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
log_verbose = cli_params.verbose;
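Note: the test hook above is compile-time gated (run_ltx_vae_test is a const false) and points at hard-coded local Windows paths, so it is inert in normal builds. A less machine-specific gate could read the paths from the environment; this is a sketch only, and the variable names SD_LTX_VAE_TEST_MODEL / SD_LTX_VAE_TEST_INPUT are made up for illustration:

// Sketch (hypothetical): requires <cstdlib> for std::getenv.
if (const char* model = std::getenv("SD_LTX_VAE_TEST_MODEL")) {
    const char* input = std::getenv("SD_LTX_VAE_TEST_INPUT");
    ltx_vae_load_from_file_and_test(model, input != nullptr ? input : "");
    return 0;
}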

View File

@@ -103,64 +103,6 @@ namespace DiT {
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
return x;
}
inline ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int pt,
int ph,
int pw,
int64_t N = 1) {
// x: [N*C, T, H, W]
// return: [N, h*w, C*pt*ph*pw]
int64_t C = x->ne[3] / N;
int64_t T = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t t_len = T / pt;
int64_t h_len = H / ph;
int64_t w_len = W / pw;
GGML_ASSERT(C * N == x->ne[3]);
GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N); // [N*C*t_len, pt, h_len*ph, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, h_len*ph, pt, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N); // [N*C*t_len*h_len, ph, pt, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt, ph, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt*ph, w_len, pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, w_len, pt*ph, pw]
x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N); // [N, C, t_len*h_len*w_len, pt*ph*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N, t_len*h_len*w_len, C, pt*ph*pw]
x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1); // [N, t_len*h_len*w_len, C*pt*ph*pw]
return x;
}
inline ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t t_len,
int64_t h_len,
int64_t w_len,
int pt,
int ph,
int pw) {
// x: [N, t_len*h_len*w_len, pt*ph*pw*C]
// return: [N*C, t_len*pt, h_len*ph, w_len*pw]
int64_t N = x->ne[3];
int64_t C = x->ne[0] / pt / ph / pw;
GGML_ASSERT(C * pt * ph * pw == x->ne[0]);
x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N); // [N, t_len*h_len*w_len, pt*ph*pw, C]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3)); // [N, C, t_len*h_len*w_len, pt*ph*pw]
x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N); // [N*C*t_len*h_len, w_len, pt*ph, pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt*ph, w_len, pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt, ph, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, ph, pt, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N); // [N*C*t_len, h_len*ph, pt, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, pt, h_len*ph, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N); // [N*C, t_len*pt, h_len*ph, w_len*pw]
return x;
}
} // namespace DiT
#endif // __COMMON_DIT_HPP__
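For reference, the patchify/unpatchify pair deleted above were exact inverses of each other; a minimal round-trip under the shape conventions in their comments (the concrete sizes here are illustrative only):

// x: ne = [W, H, T, N*C] = [16, 16, 8, 128], i.e. N = 1, C = 128
ggml_tensor* tok = DiT::patchify(ctx, x, /*pt=*/1, /*ph=*/2, /*pw=*/2);
// tok: [N, t_len*h_len*w_len, C*pt*ph*pw] = [1, 8*8*8, 128*1*2*2] = [1, 512, 512]
ggml_tensor* y = DiT::unpatchify(ctx, tok, /*t_len=*/8, /*h_len=*/8, /*w_len=*/8, 1, 2, 2);
// y: [N*C, t_len*pt, h_len*ph, w_len*pw] = [128, 8, 16, 16], matching x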

View File

@@ -1682,15 +1682,6 @@ struct GGMLRunnerContext {
bool circular_x_enabled = false;
bool circular_y_enabled = false;
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
void capture_tensor(const std::string& name, ggml_tensor* tensor) {
if (debug_tensors == nullptr || tensor == nullptr) {
return;
}
ggml_set_output(tensor);
(*debug_tensors)[tensor] = name;
}
};
struct GGMLRunner {
@@ -1722,7 +1713,6 @@ protected:
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
std::unordered_map<ggml_tensor*, std::string> debug_tensors;
const std::string final_result_name = "ggml_runner_final_result_tensor";
bool flash_attn_enabled = false;
@@ -1809,7 +1799,6 @@ protected:
}
void free_compute_ctx() {
debug_tensors.clear();
if (compute_ctx != nullptr) {
ggml_free(compute_ctx);
compute_ctx = nullptr;
@@ -1845,11 +1834,6 @@ protected:
auto result = ggml_graph_node(gf, -1);
ggml_set_name(result, final_result_name.c_str());
}
for (const auto& entry : debug_tensors) {
if (entry.first != nullptr) {
ggml_build_forward_expand(gf, entry.first);
}
}
prepare_build_in_tensor_after(gf);
return gf;
}
@@ -1919,21 +1903,6 @@ protected:
for (auto& kv : backend_tensor_data_map) {
auto tensor = kv.first;
auto data = kv.second;
if (tensor == nullptr || data == nullptr) {
continue;
}
const char* name = ggml_get_name(tensor);
if (tensor->buffer == nullptr) {
LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
get_desc().c_str(),
name != nullptr ? name : "",
(long long)tensor->ne[0],
(long long)tensor->ne[1],
(long long)tensor->ne[2],
(long long)tensor->ne[3],
ggml_type_name(tensor->type));
continue;
}
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
}
@@ -2056,7 +2025,6 @@ public:
runner_ctx.circular_x_enabled = circular_x_enabled;
runner_ctx.circular_y_enabled = circular_y_enabled;
runner_ctx.weight_adapter = weight_adapter;
runner_ctx.debug_tensors = &debug_tensors;
return runner_ctx;
}
@@ -2195,21 +2163,6 @@ public:
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
return std::nullopt;
}
for (const auto& entry : debug_tensors) {
auto tensor = entry.first;
if (tensor == nullptr) {
continue;
}
if (tensor->type != GGML_TYPE_F32) {
LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
get_desc().c_str(),
entry.second.c_str(),
ggml_type_name(tensor->type));
continue;
}
auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
print_sd_tensor(debug_tensor, false, entry.second.c_str());
}
copy_cache_tensors_to_cache_buffer();
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
std::optional<sd::Tensor<T>> output;
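The hunks above strip the runner's debug-tensor plumbing. For context, this is roughly how the deleted hook was used (all names are taken from the removed lines):

// Inside a block's forward pass, with a GGMLRunnerContext* rctx (removed API):
//   rctx->capture_tensor("blocks.0.attn_out", attn_out);  // ggml_set_output + record the name
// The runner then expanded every captured tensor into the compute graph, and
// after a successful compute each captured GGML_TYPE_F32 tensor was dumped
// with print_sd_tensor(...) under its recorded name.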

View File

@@ -1,5 +1,5 @@
#ifndef __SD_LTX_VAE_HPP__
#define __SD_LTX_VAE_HPP__
#ifndef __SD_LTX_VAE_H__
#define __SD_LTX_VAE_H__
#include <fstream>
#include <memory>
@@ -936,8 +936,7 @@ struct LTXVideoVAE : public VAE {
static void load_from_file_and_test(const std::string& model_path,
const std::string& input_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = ggml_backend_cuda_init(0);
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
ModelLoader model_loader;
@@ -968,4 +967,4 @@ struct LTXVideoVAE : public VAE {
}
};
#endif // __SD_LTX_VAE_HPP__
#endif // __SD_LTX_VAE_H__
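Note that the test helper now initializes the CUDA backend unconditionally, which cannot work on CPU-only builds. A conventional guard would look like the sketch below; the build flag name SD_USE_CUDA is an assumption for illustration:

ggml_backend_t backend = nullptr;
#ifdef SD_USE_CUDA                        // hypothetical flag; use the project's actual CUDA guard
backend = ggml_backend_cuda_init(0);      // device 0
#endif
if (backend == nullptr) {
    backend = ggml_backend_cpu_init();    // portable fallback
}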

src/ltx_vae_test.cpp (new file, 8 lines added)
View File

@@ -0,0 +1,8 @@
#include "ltx_vae_test.h"
#include "ltx_vae.h"
void ltx_vae_load_from_file_and_test(const std::string& model_path,
const std::string& input_path) {
LTXVideoVAE::load_from_file_and_test(model_path, input_path);
}

src/ltx_vae_test.h (new file, 9 lines added)
View File

@@ -0,0 +1,9 @@
#ifndef __SD_LTX_VAE_TEST_H__
#define __SD_LTX_VAE_TEST_H__
#include <string>
void ltx_vae_load_from_file_and_test(const std::string& model_path,
const std::string& input_path);
#endif // __SD_LTX_VAE_TEST_H__
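This small shim appears to exist so the CLI can trigger the VAE test without pulling ltx_vae.h (and the ggml-heavy headers behind it) into its own translation unit; main.cpp sees only this one free function and, as in the first hunk above, calls:

ltx_vae_load_from_file_and_test(model_path, input_path);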

View File

@@ -1,5 +1,5 @@
#ifndef __SD_LTXV_HPP__
#define __SD_LTXV_HPP__
#ifndef __LTXV_HPP__
#define __LTXV_HPP__
#include <algorithm>
#include <cmath>
@@ -79,30 +79,6 @@ namespace LTXV {
return out;
}
__STATIC_INLINE__ std::vector<double> generate_freq_grid_double(double theta,
int positional_dims,
int dim) {
const int n_elem = 2 * positional_dims;
const int freq_count = dim / n_elem;
std::vector<double> out(freq_count);
if (freq_count <= 0) {
return out;
}
if (freq_count == 1) {
out[0] = 1.5707963267948966;
return out;
}
const double half_pi = 1.5707963267948966;
const double log_theta = std::log(theta);
for (int i = 0; i < freq_count; i++) {
double ratio = static_cast<double>(i) / static_cast<double>(freq_count - 1);
out[i] = std::exp(log_theta * ratio) * half_pi;
}
return out;
}
__STATIC_INLINE__ std::vector<float> build_rope_matrix_from_frequencies(
const std::vector<std::vector<float>>& frequencies,
int dim) {
@@ -126,43 +102,16 @@
return out;
}
__STATIC_INLINE__ std::vector<std::vector<float>> split_frequencies_by_heads(
const std::vector<std::vector<float>>& frequencies,
int inner_dim,
int num_heads) {
GGML_ASSERT(num_heads > 0);
GGML_ASSERT(inner_dim % num_heads == 0);
const int inner_half_dim = inner_dim / 2;
const int per_head_half_dim = inner_half_dim / num_heads;
GGML_ASSERT(inner_half_dim % num_heads == 0);
std::vector<std::vector<float>> out(
frequencies.size() * static_cast<size_t>(num_heads),
std::vector<float>(per_head_half_dim, 0.f));
for (size_t token = 0; token < frequencies.size(); token++) {
GGML_ASSERT(static_cast<int>(frequencies[token].size()) == inner_half_dim);
for (int head = 0; head < num_heads; head++) {
auto& dst = out[token * static_cast<size_t>(num_heads) + static_cast<size_t>(head)];
std::copy_n(frequencies[token].begin() + head * per_head_half_dim, per_head_half_dim, dst.begin());
}
}
return out;
}
__STATIC_INLINE__ std::vector<float> build_video_rope_matrix(int64_t width,
int64_t height,
int64_t frames,
int dim,
int num_heads = 1,
float frame_rate = 25.f,
float theta = 10000.f,
const std::vector<int>& max_pos = {20, 2048, 2048},
const std::tuple<int, int, int>& vae_scale_factors = {8, 32, 32},
bool causal_temporal_positioning = false,
bool use_middle_indices_grid = false) {
bool causal_temporal_positioning = false) {
GGML_ASSERT(max_pos.size() == 3);
GGML_ASSERT(dim % num_heads == 0);
const std::vector<float> indices = generate_freq_grid(theta, 3, dim);
const int half_dim = dim / 2;
const int pad_size = half_dim - static_cast<int>(indices.size()) * 3;
@@ -180,25 +129,11 @@ namespace LTXV {
pixel_t = std::max(0.f, pixel_t + 1.f - scale_t);
}
pixel_t /= frame_rate;
if (use_middle_indices_grid) {
float end = static_cast<float>((t + 1) * scale_t);
if (causal_temporal_positioning) {
end = std::max(0.f, end + 1.f - scale_t);
}
end /= frame_rate;
pixel_t = 0.5f * (pixel_t + end);
}
for (int64_t h = 0; h < height; h++) {
float pixel_h = static_cast<float>(h * scale_h);
if (use_middle_indices_grid) {
pixel_h += 0.5f * static_cast<float>(scale_h);
}
for (int64_t w = 0; w < width; w++) {
float pixel_w = static_cast<float>(w * scale_w);
if (use_middle_indices_grid) {
pixel_w += 0.5f * static_cast<float>(scale_w);
}
int out_idx = 0;
for (int i = 0; i < pad_size; i++) {
@@ -211,6 +146,13 @@ namespace LTXV {
pixel_w / max_pos[2],
};
// Match ComfyUI generate_freqs():
// (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
// .transpose(-1, -2)
// .flatten(2)
// After the transpose, the half-dim order is:
// [t_f0, h_f0, w_f0, t_f1, h_f1, w_f1, ...]
// not [t_f0, t_f1, ..., h_f0, h_f1, ..., w_f0, w_f1, ...].
for (float index : indices) {
for (int axis = 0; axis < 3; axis++) {
freqs[token][out_idx++] = index * (coords[axis] * 2.f - 1.f);
@@ -221,24 +163,16 @@
}
}
if (num_heads > 1) {
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
}
return build_rope_matrix_from_frequencies(freqs, dim);
}
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix(int64_t seq_len,
int dim,
int num_heads = 1,
float theta = 10000.f,
float positional_scale = 4096.f,
bool double_precision = false) {
GGML_ASSERT(dim % num_heads == 0);
const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
const std::vector<double> indices_d =
double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
float positional_scale = 4096.f) {
const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
const int half_dim = dim / 2;
const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
const int pad_size = half_dim - static_cast<int>(indices.size());
std::vector<std::vector<float>> freqs(static_cast<size_t>(seq_len), std::vector<float>(half_dim, 0.f));
for (int64_t pos = 0; pos < seq_len; pos++) {
@@ -247,39 +181,20 @@ namespace LTXV {
freqs[static_cast<size_t>(pos)][out_idx++] = 0.f;
}
if (double_precision) {
double coord = static_cast<double>(pos) / static_cast<double>(positional_scale);
for (double index : indices_d) {
freqs[static_cast<size_t>(pos)][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
}
} else {
float coord = static_cast<float>(pos) / positional_scale;
for (float index : indices) {
freqs[static_cast<size_t>(pos)][out_idx++] = index * (coord * 2.f - 1.f);
}
}
}
if (num_heads > 1) {
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
}
return build_rope_matrix_from_frequencies(freqs, dim);
}
__STATIC_INLINE__ ggml_tensor* apply_hidden_rope(ggml_context* ctx,
ggml_tensor* x,
ggml_tensor* pe,
int64_t heads,
int64_t dim_head,
bool rope_interleaved) {
GGML_ASSERT(x->ne[0] == heads * dim_head);
auto x4 = ggml_reshape_4d(ctx, x, dim_head, heads, x->ne[1], x->ne[2]);
if (pe != nullptr && pe->ne[3] == x->ne[1] * heads) {
auto x_flat = ggml_reshape_4d(ctx, x4, dim_head, 1, x->ne[1] * heads, x->ne[2]);
auto out_flat = Rope::apply_rope(ctx, x_flat, pe, rope_interleaved);
auto out4 = ggml_reshape_4d(ctx, out_flat, dim_head, heads, x->ne[1], x->ne[2]);
return ggml_reshape_3d(ctx, out4, heads * dim_head, x->ne[1], x->ne[2]);
}
auto x4 = ggml_reshape_4d(ctx, x, x->ne[0], 1, x->ne[1], x->ne[2]);
return Rope::apply_rope(ctx, x4, pe, rope_interleaved);
}
@@ -423,8 +338,8 @@ namespace LTXV {
if (k_pe == nullptr) {
k_pe = pe;
}
q = apply_hidden_rope(ctx->ggml_ctx, q, pe, heads, dim_head, rope_interleaved);
k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, heads, dim_head, rope_interleaved);
q = apply_hidden_rope(ctx->ggml_ctx, q, pe, rope_interleaved);
k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, rope_interleaved);
}
auto out = ggml_ext_attention_ext(ctx->ggml_ctx,
@@ -500,7 +415,7 @@ namespace LTXV {
s = ggml_repeat(ctx->ggml_ctx, s, e);
t = ggml_repeat(ctx->ggml_ctx, t, e);
auto out = ggml_add(ctx->ggml_ctx, s, t);
return ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
return ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
}
std::vector<ggml_tensor*> get_prompt_scale_shift_values(GGMLRunnerContext* ctx,
@@ -694,7 +609,7 @@ namespace LTXV {
float positional_embedding_theta = 10000.f;
std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
std::tuple<int, int, int> vae_scale_factors = {8, 32, 32};
bool causal_temporal_positioning = true;
bool causal_temporal_positioning = false;
float timestep_scale_multiplier = 1000.f;
int64_t audio_in_channels = 128;
@@ -726,14 +641,11 @@ namespace LTXV {
bool audio_connector_rope_interleaved = false;
bool audio_connector_apply_gated_attention = false;
bool video_rope_interleaved = false;
bool use_middle_indices_grid = true;
bool video_rope_interleaved = true;
bool cross_attention_adaln = false;
bool use_caption_projection = true;
bool use_audio_caption_projection = true;
bool caption_proj_before_connector = true;
bool caption_projection_first_linear = false;
bool self_attention_gated = false;
bool cross_attention_gated = false;
@@ -758,16 +670,11 @@ namespace LTXV {
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix_from_coords(const std::vector<float>& coords,
int dim,
int num_heads = 1,
float theta = 10000.f,
float max_pos = 20.f,
bool double_precision = false) {
GGML_ASSERT(dim % num_heads == 0);
const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
const std::vector<double> indices_d =
double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
float max_pos = 20.f) {
const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
const int half_dim = dim / 2;
const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
const int pad_size = half_dim - static_cast<int>(indices.size());
std::vector<std::vector<float>> freqs(coords.size(), std::vector<float>(half_dim, 0.f));
for (size_t pos = 0; pos < coords.size(); pos++) {
@@ -775,21 +682,11 @@ namespace LTXV {
for (int i = 0; i < pad_size; i++) {
freqs[pos][out_idx++] = 0.f;
}
if (double_precision) {
double coord = static_cast<double>(coords[pos]) / static_cast<double>(max_pos);
for (double index : indices_d) {
freqs[pos][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
}
} else {
float coord = coords[pos] / max_pos;
for (float index : indices) {
freqs[pos][out_idx++] = index * (coord * 2.f - 1.f);
}
}
}
if (num_heads > 1) {
return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
}
return build_rope_matrix_from_frequencies(freqs, dim);
}
@@ -808,7 +705,6 @@ namespace LTXV {
int64_t height,
int64_t frames,
int dim,
int num_heads,
float frame_rate,
float theta,
int max_pos_t,
@@ -829,7 +725,7 @@ namespace LTXV {
}
}
}
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
}
__STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
@@ -846,7 +742,6 @@ namespace LTXV {
__STATIC_INLINE__ std::vector<float> build_audio_rope_matrix(int64_t seq_len,
int dim,
int num_heads,
float theta = 10000.f,
int max_pos_t = 20,
bool use_middle_indices_grid = false) {
@@ -860,7 +755,7 @@ namespace LTXV {
coords[static_cast<size_t>(t)] = start;
}
}
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
}
struct BasicAVTransformerBlock : public GGMLBlock {
@@ -930,7 +825,7 @@ namespace LTXV {
t = ggml_repeat(ctx->ggml_ctx, t, e);
s = ggml_repeat(ctx->ggml_ctx, s, e);
auto out = ggml_add(ctx->ggml_ctx, s, t);
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
return std::vector<ggml_tensor*>(chunks.begin() + start, chunks.begin() + start + count);
}
@@ -1109,23 +1004,11 @@ namespace LTXV {
blocks["av_ca_v2a_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 1);
if (cfg.use_caption_projection) {
if (cfg.caption_proj_before_connector) {
if (cfg.caption_projection_first_linear) {
blocks["caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.hidden_size);
}
} else {
blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.hidden_size, cfg.hidden_size);
}
}
if (cfg.use_audio_caption_projection) {
if (cfg.caption_proj_before_connector) {
if (cfg.caption_projection_first_linear) {
blocks["audio_caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.audio_hidden_size);
}
} else {
blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.audio_hidden_size, cfg.audio_hidden_size);
}
}
if (cfg.use_connector) {
blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.connector_hidden_size,
@@ -1197,97 +1080,42 @@ namespace LTXV {
std::pair<ggml_tensor*, ggml_tensor*> preprocess_contexts(GGMLRunnerContext* ctx,
ggml_tensor* context,
ggml_tensor* video_connector_pe,
ggml_tensor* audio_connector_pe,
bool process_audio_context) {
ggml_tensor* audio_connector_pe) {
if (context == nullptr) {
return {nullptr, nullptr};
}
bool is_fully_processed_context =
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
context->ne[1] >= 1024;
bool is_unprocessed_dual_context =
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
context->ne[1] < 1024;
if (is_fully_processed_context) {
auto v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
ggml_tensor* a_context = nullptr;
if (process_audio_context) {
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
}
return {v_context, a_context};
if (context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim) {
return {
ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim),
ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim)
};
}
ggml_tensor* v_context = context;
ggml_tensor* a_context = process_audio_context ? context : nullptr;
if (is_unprocessed_dual_context) {
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
if (process_audio_context) {
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
}
} else if (context->ne[0] == cfg.caption_channels * 2) {
ggml_tensor* a_context = context;
if (context->ne[0] == cfg.caption_channels * 2) {
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.caption_channels);
if (process_audio_context) {
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.caption_channels, cfg.caption_channels * 2);
}
}
if (cfg.caption_proj_before_connector) {
if (cfg.use_caption_projection &&
blocks.count("caption_projection") > 0 &&
v_context != nullptr &&
v_context->ne[0] == cfg.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["caption_projection"]);
if (caption_projection != nullptr) {
v_context = caption_projection->forward(ctx, v_context);
}
}
if (process_audio_context &&
cfg.use_audio_caption_projection &&
blocks.count("audio_caption_projection") > 0 &&
a_context != nullptr &&
a_context->ne[0] == cfg.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["audio_caption_projection"]);
if (caption_projection != nullptr) {
a_context = caption_projection->forward(ctx, a_context);
}
}
}
if (cfg.use_connector && v_context != nullptr && v_context->ne[0] == cfg.connector_hidden_size) {
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["video_embeddings_connector"]);
v_context = connector->forward(ctx, v_context, video_connector_pe);
}
if (process_audio_context &&
cfg.use_audio_connector &&
a_context != nullptr &&
a_context->ne[0] == cfg.audio_connector_hidden_size) {
if (cfg.use_audio_connector && a_context != nullptr && a_context->ne[0] == cfg.audio_connector_hidden_size) {
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["audio_embeddings_connector"]);
a_context = connector->forward(ctx, a_context, audio_connector_pe);
}
if (!cfg.caption_proj_before_connector &&
cfg.use_caption_projection &&
blocks.count("caption_projection") > 0 &&
v_context != nullptr &&
v_context->ne[0] == cfg.caption_channels) {
if (cfg.use_caption_projection && v_context != nullptr && v_context->ne[0] == cfg.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["caption_projection"]);
if (caption_projection != nullptr) {
v_context = caption_projection->forward(ctx, v_context);
}
}
if (process_audio_context &&
!cfg.caption_proj_before_connector &&
cfg.use_audio_caption_projection &&
blocks.count("audio_caption_projection") > 0 &&
a_context != nullptr &&
a_context->ne[0] == cfg.caption_channels) {
if (cfg.use_audio_caption_projection && a_context != nullptr && a_context->ne[0] == cfg.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["audio_caption_projection"]);
if (caption_projection != nullptr) {
a_context = caption_projection->forward(ctx, a_context);
}
}
return {v_context, a_context};
}
@@ -1340,13 +1168,9 @@ namespace LTXV {
ax = nullptr;
}
bool run_ax = ax != nullptr && ggml_nelements(ax) > 0 && audio_time > 0;
auto contexts = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe, run_ax);
auto contexts = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe);
auto v_context = contexts.first;
auto a_context = contexts.second != nullptr ? contexts.second : contexts.first;
if (contexts.second != nullptr) {
a_context = ggml_cont(ctx->ggml_ctx, a_context);
}
auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, cfg.timestep_scale_multiplier);
auto v_pair = adaln_single->forward(ctx, v_timestep_scaled);
@@ -1433,8 +1257,6 @@ namespace LTXV {
std::vector<float> audio_cross_pe_vec;
std::vector<float> connector_pe_vec;
std::vector<float> audio_connector_pe_vec;
sd::Tensor<float> vx_input_cache;
sd::Tensor<float> ax_input_cache;
static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
const std::string& bias_name,
@@ -1566,7 +1388,7 @@ namespace LTXV {
model.get_param_tensors(tensors, prefix);
}
std::pair<sd::Tensor<float>, sd::Tensor<float>> split_av_latents(const sd::Tensor<float>& x_tensor,
std::pair<sd::Tensor<float>, sd::Tensor<float>> separate_audio_and_video_latents(const sd::Tensor<float>& x_tensor,
int audio_length) const {
if (x_tensor.empty()) {
return {{}, {}};
@@ -1602,7 +1424,7 @@ namespace LTXV {
return {vx, ax};
}
ggml_tensor* merge_av_latents(ggml_context* ctx,
ggml_tensor* recombine_audio_and_video_latents(ggml_context* ctx,
ggml_tensor* vx,
ggml_tensor* ax) const {
if (ax == nullptr || ggml_nelements(ax) == 0 || ax->ne[1] == 0) {
@@ -1633,16 +1455,12 @@ namespace LTXV {
const sd::Tensor<float>& audio_x_tensor = {},
const sd::Tensor<float>& audio_timesteps_tensor = {},
int audio_length = 0) {
auto split_inputs = split_av_latents(x_tensor, audio_length);
vx_input_cache = split_inputs.first;
if (!audio_x_tensor.empty()) {
ax_input_cache = audio_x_tensor;
} else {
ax_input_cache = split_inputs.second;
}
auto split_inputs = separate_audio_and_video_latents(x_tensor, audio_length);
const sd::Tensor<float>& vx_tensor = split_inputs.first;
const sd::Tensor<float>& ax_tensor = !audio_x_tensor.empty() ? audio_x_tensor : split_inputs.second;
ggml_tensor* vx = make_input(vx_input_cache);
ggml_tensor* ax = make_optional_input(ax_input_cache);
ggml_tensor* vx = make_input(vx_tensor);
ggml_tensor* ax = make_optional_input(ax_tensor);
ggml_tensor* timesteps = make_input(timesteps_tensor);
ggml_tensor* a_timestep = make_optional_input(audio_timesteps_tensor);
ggml_tensor* context = make_optional_input(context_tensor);
@@ -1653,15 +1471,12 @@ namespace LTXV {
vx->ne[1],
vx->ne[2],
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
24.f,
25.f,
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.vae_scale_factors,
params.causal_temporal_positioning,
params.use_middle_indices_grid);
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads);
ggml_set_name(video_pe, "ltxav_video_pe");
params.causal_temporal_positioning);
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.hidden_size / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
set_backend_tensor_data(video_pe, video_pe_vec.data());
ggml_tensor* audio_pe = nullptr;
@@ -1670,12 +1485,10 @@ namespace LTXV {
if (ax != nullptr && ggml_nelements(ax) > 0 && ax->ne[1] > 0) {
audio_pe_vec = build_audio_rope_matrix(ax->ne[1],
static_cast<int>(params.audio_hidden_size),
static_cast<int>(params.audio_num_attention_heads),
params.positional_embedding_theta,
params.audio_positional_embedding_max_pos[0],
params.use_middle_indices_grid);
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
ggml_set_name(audio_pe, "ltxav_audio_pe");
false);
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_hidden_size / 2, ax->ne[1]);
set_backend_tensor_data(audio_pe, audio_pe_vec.data());
int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
@@ -1683,68 +1496,43 @@ namespace LTXV {
vx->ne[1],
vx->ne[2],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
25.f,
params.positional_embedding_theta,
temporal_max_pos,
std::get<0>(params.vae_scale_factors),
params.causal_temporal_positioning,
true);
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads);
ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
audio_cross_pe_vec = build_audio_rope_matrix(ax->ne[1],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
params.positional_embedding_theta,
temporal_max_pos,
true);
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
ggml_set_name(audio_cross_pe, "ltxav_audio_cross_pe");
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, ax->ne[1]);
set_backend_tensor_data(audio_cross_pe, audio_cross_pe_vec.data());
}
bool needs_video_connector_pe =
params.use_connector &&
context != nullptr &&
(context->ne[0] == params.connector_hidden_size ||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
context->ne[0] == params.caption_channels * 2) &&
context->ne[1] < 1024));
ggml_tensor* video_connector_pe = nullptr;
if (needs_video_connector_pe) {
if (params.use_connector && context != nullptr && context->ne[0] == params.connector_hidden_size) {
int64_t seq_len = context->ne[1];
int64_t target_len = std::max<int64_t>(1024, seq_len);
int64_t duplications = (target_len + params.connector_num_registers - 1) / params.connector_num_registers;
int64_t full_len = seq_len + duplications * params.connector_num_registers - seq_len;
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size), static_cast<int>(params.connector_num_heads), 10000.f, 4096.f, true);
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_head_dim / 2, full_len * params.connector_num_heads);
ggml_set_name(video_connector_pe, "ltxav_video_connector_pe");
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size));
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_hidden_size / 2, full_len);
set_backend_tensor_data(video_connector_pe, connector_pe_vec.data());
}
bool run_audio_context =
ax != nullptr &&
ggml_nelements(ax) > 0 &&
ax->ne[1] > 0;
bool needs_audio_connector_pe =
run_audio_context &&
params.use_audio_connector &&
context != nullptr &&
(context->ne[0] == params.audio_connector_hidden_size ||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
context->ne[0] == params.caption_channels * 2) &&
context->ne[1] < 1024));
ggml_tensor* audio_connector_pe = nullptr;
if (needs_audio_connector_pe) {
if (params.use_audio_connector && context != nullptr && context->ne[0] == params.audio_connector_hidden_size) {
int64_t seq_len = context->ne[1];
int64_t target_len = std::max<int64_t>(1024, seq_len);
int64_t duplications = (target_len + params.audio_connector_num_registers - 1) / params.audio_connector_num_registers;
int64_t full_len = seq_len + duplications * params.audio_connector_num_registers - seq_len;
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size), static_cast<int>(params.audio_connector_num_heads), 10000.f, 4096.f, true);
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_head_dim / 2, full_len * params.audio_connector_num_heads);
ggml_set_name(audio_connector_pe, "ltxav_audio_connector_pe");
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size));
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_hidden_size / 2, full_len);
set_backend_tensor_data(audio_connector_pe, audio_connector_pe_vec.data());
}
@@ -1761,7 +1549,7 @@ namespace LTXV {
audio_cross_pe,
video_connector_pe,
audio_connector_pe);
auto out = merge_av_latents(compute_ctx, out_pair.first, out_pair.second);
auto out = recombine_audio_and_video_latents(compute_ctx, out_pair.first, out_pair.second);
ggml_build_forward_expand(gf, out);
return gf;
}
@@ -1776,106 +1564,7 @@ namespace LTXV {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length);
};
auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
return out;
}
void test(const std::string& x_path,
const std::string& timesteps_path = "",
const std::string& context_path = "",
const std::string& audio_x_path = "",
const std::string& audio_timesteps_path = "") {
auto x = sd::load_tensor_from_file_as_tensor<float>(x_path);
GGML_ASSERT(!x.empty());
print_sd_tensor(x, false, "ltxav_x");
sd::Tensor<float> timesteps;
if (!timesteps_path.empty()) {
timesteps = sd::load_tensor_from_file_as_tensor<float>(timesteps_path);
} else {
timesteps = sd::Tensor<float>::from_vector(std::vector<float>{1.f});
}
GGML_ASSERT(!timesteps.empty());
print_sd_tensor(timesteps, false, "ltxav_timesteps");
sd::Tensor<float> context;
if (!context_path.empty()) {
context = sd::load_tensor_from_file_as_tensor<float>(context_path);
GGML_ASSERT(!context.empty());
print_sd_tensor(context, false, "ltxav_context");
}
sd::Tensor<float> audio_x;
int audio_length = 0;
if (!audio_x_path.empty()) {
audio_x = sd::load_tensor_from_file_as_tensor<float>(audio_x_path);
GGML_ASSERT(!audio_x.empty());
GGML_ASSERT(audio_x.dim() >= 2);
audio_length = static_cast<int>(audio_x.shape()[1]);
print_sd_tensor(audio_x, false, "ltxav_audio_x");
}
sd::Tensor<float> audio_timesteps;
if (!audio_timesteps_path.empty()) {
audio_timesteps = sd::load_tensor_from_file_as_tensor<float>(audio_timesteps_path);
GGML_ASSERT(!audio_timesteps.empty());
} else if (!audio_x.empty()) {
audio_timesteps = timesteps;
}
if (!audio_timesteps.empty()) {
print_sd_tensor(audio_timesteps, false, "ltxav_audio_timesteps");
}
int64_t t0 = ggml_time_ms();
auto out_opt = compute(8, x, timesteps, context, audio_x, audio_timesteps, audio_length);
int64_t t1 = ggml_time_ms();
GGML_ASSERT(!out_opt.empty());
print_sd_tensor(out_opt, false, "ltxav_out");
LOG_DEBUG("ltxav test done in %lldms", t1 - t0);
}
static void load_from_file_and_test(const std::string& model_path,
const std::string& x_path,
const std::string& timesteps_path = "",
const std::string& context_path = "",
const std::string& embeddings_path = "",
const std::string& audio_x_path = "",
const std::string& audio_timesteps_path = "") {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
LOG_INFO("loading ltxav from '%s'", model_path.c_str());
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(model_path, "model.diffusion_model.")) {
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
return;
}
if (!embeddings_path.empty()) {
LOG_INFO("loading ltxav embeddings from '%s'", embeddings_path.c_str());
if (!model_loader.init_from_file(embeddings_path)) {
LOG_ERROR("init embeddings model loader from file failed: '%s'", embeddings_path.c_str());
return;
}
}
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
std::shared_ptr<LTXAVRunner> ltxav = std::make_shared<LTXAVRunner>(backend,
false,
tensor_storage_map,
"model.diffusion_model");
ltxav->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
ltxav->get_param_tensors(tensors, "model.diffusion_model");
if (!model_loader.load_tensors(tensors)) {
LOG_ERROR("load tensors from model loader failed");
return;
}
LOG_INFO("ltxav model loaded");
ltxav->test(x_path, timesteps_path, context_path, audio_x_path, audio_timesteps_path);
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
};
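A recurring pattern in the diff above: the per-head RoPE table splitting (num_heads, split_frequencies_by_heads, double_precision) is removed, and every positional-embedding tensor is now built for the full hidden dimension with shape [2, 2, dim/2, tokens]. A minimal sketch of that plumbing as it stands after the change, assuming (per the shapes used above) that build_rope_matrix_from_frequencies emits one 2x2 rotation block per frequency pair:

std::vector<float> pe_vec = build_1d_rope_matrix(seq_len, dim);  // 2*2*(dim/2)*seq_len floats
ggml_tensor* pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, dim / 2, seq_len);
set_backend_tensor_data(pe, pe_vec.data());  // raw floats are copied to the backend before compute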

View File

@@ -14,7 +14,7 @@
#include "diffusion_model.hpp"
#include "esrgan.hpp"
#include "lora.hpp"
#include "ltx_vae.hpp"
#include "ltx_vae.h"
#include "pmid.hpp"
#include "sample-cache.h"
#include "tae.hpp"
@@ -3742,8 +3742,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
}
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
latents.audio_length = 0;
latents.audio_latent = {};
latents.audio_length = get_ltxav_num_audio_latents(request->frames, request->fps);
latents.audio_latent = sd::zeros<float>({16, latents.audio_length, 8, 1});
}
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
@@ -3923,8 +3923,9 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
}
// Pipeline-level audio support is temporarily disabled. Keep the model-side
// AV implementation intact, but feed pure video latents through vid_gen.
if (!latents.audio_latent.empty()) {
latents.init_latent = pack_ltxav_audio_and_video_latents(latents.init_latent, latents.audio_latent);
}
return latents;
}

Binary file not shown.