fix ubuntu build

fix ci
change vocab file encoding
2026-05-08 16:28:53 +00:00 · 2026-04-29 01:30:04 +08:00 · 2026-04-29 01:23:20 +08:00 · 2026-04-29 01:17:45 +08:00 · 2026-04-29 01:11:15 +08:00
10 changed files with 702 additions and 326 deletions
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -19,8 +19,6 @@
 #include "common/media_io.h"
 #include "common/resource_owners.hpp"
 #include "image_metadata.h"
 #include "llm.hpp"
 #include "ltx_vae_test.h"
 namespace fs = std::filesystem;
@ -502,27 +500,6 @@ int main(int argc, const char* argv[]) {
    SDContextParams ctx_params;
    SDGenerationParams gen_params;
    cli_params.verbose = true;
    sd_set_log_callback(sd_log_cb, (void*)&cli_params);
    {
        const bool run_ltx_vae_test  = false;
        const std::string model_path = "E:/Code/ComfyUI/models/vae/ltx-2.3-22b-dev_video_vae.safetensors";
        const std::string input_path = "E:/Code/sd.cpp/build/ltx_vae_z.bin";
        if (run_ltx_vae_test) {
            ltx_vae_load_from_file_and_test(model_path, input_path);
            return 0;
        }
    }
    // cli_params.verbose = true;
    // sd_set_log_callback(sd_log_cb, (void*)&cli_params);
    // GemmaTokenizer tokenizer;
    // auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
    // for (auto token : tokens) {
    //     LOG_INFO("%d", token);
    // }
    // return 0;
    parse_args(argc, argv, cli_params, ctx_params, gen_params);
    sd_set_log_callback(sd_log_cb, (void*)&cli_params);
    log_verbose = cli_params.verbose;
--- a/src/common_dit.hpp
+++ b/src/common_dit.hpp
@ -103,6 +103,64 @@ namespace DiT {
        x         = ggml_ext_slice(ctx, x, 0, 0, W);               // [N, C, H, W]
        return x;
    }
    // Rearranges a video-shaped tensor into a sequence of flattened patches.
    //
    // x:      [N*C, T, H, W] in torch order, i.e. ggml ne = [W, H, T, N*C]
    // return: [N, t_len*h_len*w_len, C*pt*ph*pw]
    //         with t_len = T / pt, h_len = H / ph, w_len = W / pw
    //
    // pt / ph / pw are the patch sizes along T / H / W and must divide those
    // extents exactly; N is the batch size folded into the outermost dimension
    // of x. All four must be strictly positive.
    inline ggml_tensor* patchify(ggml_context* ctx,
                                 ggml_tensor* x,
                                 int pt,
                                 int ph,
                                 int pw,
                                 int64_t N = 1) {
        // Validate the divisors before they are used: a zero here would otherwise
        // be an integer division by zero ahead of the consistency asserts below.
        GGML_ASSERT(N > 0 && pt > 0 && ph > 0 && pw > 0);
        int64_t C     = x->ne[3] / N;
        int64_t T     = x->ne[2];
        int64_t H     = x->ne[1];
        int64_t W     = x->ne[0];
        int64_t t_len = T / pt;
        int64_t h_len = H / ph;
        int64_t w_len = W / pw;
        // The divisions above truncate; make sure nothing was silently dropped.
        GGML_ASSERT(C * N == x->ne[3]);
        GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);
        // Each reshape/permute pair moves one patch axis (pt, then ph, then pw)
        // towards the innermost position; the trailing comments give the shape
        // in torch order after each step.
        x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N);      // [N*C*t_len, pt, h_len*ph, w_len*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len, h_len*ph, pt, w_len*pw]
        x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N);      // [N*C*t_len*h_len, ph, pt, w_len*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, pt, ph, w_len*pw]
        x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N);      // [N*C*t_len*h_len, pt*ph, w_len, pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, w_len, pt*ph, pw]
        x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N);      // [N, C, t_len*h_len*w_len, pt*ph*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N, t_len*h_len*w_len, C, pt*ph*pw]
        x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1);  // [N, t_len*h_len*w_len, C*pt*ph*pw]
        return x;
    }
    // Inverse rearrangement of patchify: turns a sequence of flattened patches
    // back into a video-shaped tensor.
    //
    // x:      [N, t_len*h_len*w_len, pt*ph*pw*C]
    // return: [N*C, t_len*pt, h_len*ph, w_len*pw]
    //
    // t_len / h_len / w_len are the patch counts along T / H / W and
    // pt / ph / pw the patch sizes, which must be strictly positive.
    // Note the channel is the innermost factor of the last dimension here.
    inline ggml_tensor* unpatchify(ggml_context* ctx,
                                   ggml_tensor* x,
                                   int64_t t_len,
                                   int64_t h_len,
                                   int64_t w_len,
                                   int pt,
                                   int ph,
                                   int pw) {
        // Validate the divisors before they are used: a zero here would otherwise
        // be an integer division by zero ahead of the consistency assert below.
        GGML_ASSERT(pt > 0 && ph > 0 && pw > 0);
        // NOTE(review): N is read from ne[3]; for a 3-D input whose batch sits in
        // ne[2] this yields N == 1 - confirm against callers that use batch > 1.
        int64_t N = x->ne[3];
        int64_t C = x->ne[0] / pt / ph / pw;
        // The division above truncates; make sure ne[0] is an exact multiple.
        GGML_ASSERT(C * pt * ph * pw == x->ne[0]);
        // First pull the channel out of the innermost dimension, then undo the
        // patch axes one by one (pw, then ph, then pt); the trailing comments
        // give the shape in torch order after each step.
        x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N);  // [N, t_len*h_len*w_len, pt*ph*pw, C]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));      // [N, C, t_len*h_len*w_len, pt*ph*pw]
        x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N);  // [N*C*t_len*h_len, w_len, pt*ph, pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, pt*ph, w_len, pw]
        x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N);  // [N*C*t_len*h_len, pt, ph, w_len*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, ph, pt, w_len*pw]
        x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N);  // [N*C*t_len, h_len*ph, pt, w_len*pw]
        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len, pt, h_len*ph, w_len*pw]
        x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N);  // [N*C, t_len*pt, h_len*ph, w_len*pw]
        return x;
    }
 }  // namespace DiT
 #endif  // __COMMON_DIT_HPP__
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@ -1682,6 +1682,15 @@ struct GGMLRunnerContext {
    bool circular_x_enabled                                      = false;
    bool circular_y_enabled                                      = false;
    std::shared_ptr<WeightAdapter> weight_adapter                = nullptr;
    std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
    // Registers `tensor` under `name` in the debug-tensor map and flags it as a
    // graph output. Does nothing when no map is attached or the tensor is null.
    // A tensor registered twice keeps the most recent name.
    void capture_tensor(const std::string& name, ggml_tensor* tensor) {
        const bool can_capture = debug_tensors != nullptr && tensor != nullptr;
        if (can_capture) {
            ggml_set_output(tensor);
            debug_tensors->insert_or_assign(tensor, name);
        }
    }
 };
 struct GGMLRunner {
@ -1713,6 +1722,7 @@ protected:
    std::map<ggml_tensor*, const void*> backend_tensor_data_map;
    std::map<std::string, ggml_tensor*> cache_tensor_map;  // name -> tensor
    std::unordered_map<ggml_tensor*, std::string> debug_tensors;
    const std::string final_result_name = "ggml_runner_final_result_tensor";
    bool flash_attn_enabled    = false;
@ -1799,6 +1809,7 @@ protected:
    }
    void free_compute_ctx() {
        debug_tensors.clear();
        if (compute_ctx != nullptr) {
            ggml_free(compute_ctx);
            compute_ctx = nullptr;
@ -1834,6 +1845,11 @@ protected:
            auto result = ggml_graph_node(gf, -1);
            ggml_set_name(result, final_result_name.c_str());
        }
        for (const auto& entry : debug_tensors) {
            if (entry.first != nullptr) {
                ggml_build_forward_expand(gf, entry.first);
            }
        }
        prepare_build_in_tensor_after(gf);
        return gf;
    }
@ -1903,6 +1919,21 @@ protected:
        for (auto& kv : backend_tensor_data_map) {
            auto tensor = kv.first;
            auto data   = kv.second;
            if (tensor == nullptr || data == nullptr) {
                continue;
            }
            const char* name = ggml_get_name(tensor);
            if (tensor->buffer == nullptr) {
                LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
                         get_desc().c_str(),
                         name != nullptr ? name : "",
                         (long long)tensor->ne[0],
                         (long long)tensor->ne[1],
                         (long long)tensor->ne[2],
                         (long long)tensor->ne[3],
                         ggml_type_name(tensor->type));
                continue;
            }
            ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
        }
@ -2025,6 +2056,7 @@ public:
        runner_ctx.circular_x_enabled    = circular_x_enabled;
        runner_ctx.circular_y_enabled    = circular_y_enabled;
        runner_ctx.weight_adapter        = weight_adapter;
        runner_ctx.debug_tensors         = &debug_tensors;
        return runner_ctx;
    }
@ -2163,6 +2195,21 @@ public:
            LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
            return std::nullopt;
        }
        for (const auto& entry : debug_tensors) {
            auto tensor = entry.first;
            if (tensor == nullptr) {
                continue;
            }
            if (tensor->type != GGML_TYPE_F32) {
                LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
                         get_desc().c_str(),
                         entry.second.c_str(),
                         ggml_type_name(tensor->type));
                continue;
            }
            auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
            print_sd_tensor(debug_tensor, false, entry.second.c_str());
        }
        copy_cache_tensors_to_cache_buffer();
        auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
        std::optional<sd::Tensor<T>> output;
--- a/src/ltx_vae.hpp
+++ b/src/ltx_vae.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_LTX_VAE_H__
+#ifndef __SD_LTX_VAE_HPP__
-#define __SD_LTX_VAE_H__
+#define __SD_LTX_VAE_HPP__
 #include <fstream>
 #include <memory>
@ -936,7 +936,8 @@ struct LTXVideoVAE : public VAE {
    static void load_from_file_and_test(const std::string& model_path,
                                        const std::string& input_path) {
-        ggml_backend_t backend = ggml_backend_cuda_init(0);
+        // ggml_backend_t backend = ggml_backend_cuda_init(0);
        ggml_backend_t backend = ggml_backend_cpu_init();
        LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
        ModelLoader model_loader;
@ -967,4 +968,4 @@ struct LTXVideoVAE : public VAE {
    }
 };
-#endif  // __SD_LTX_VAE_H__
+#endif  // __SD_LTX_VAE_HPP__
--- a/src/ltx_vae_test.cpp
+++ b/src/ltx_vae_test.cpp
@ -1,8 +0,0 @@
 #include "ltx_vae_test.h"
 #include "ltx_vae.h"
 // Free-function entry point that forwards both paths unchanged to
 // LTXVideoVAE::load_from_file_and_test.
 void ltx_vae_load_from_file_and_test(const std::string& model_path,
                                      const std::string& input_path) {
    LTXVideoVAE::load_from_file_and_test(model_path, input_path);
 }
--- a/src/ltx_vae_test.h
+++ b/src/ltx_vae_test.h
@ -1,9 +0,0 @@
 #ifndef __SD_LTX_VAE_TEST_H__
 #define __SD_LTX_VAE_TEST_H__
 #include <string>
 void ltx_vae_load_from_file_and_test(const std::string& model_path,
                                     const std::string& input_path);
 #endif  // __SD_LTX_VAE_TEST_H__
--- a/src/ltxv.hpp
+++ b/src/ltxv.hpp
@ -1,5 +1,5 @@
-#ifndef __LTXV_HPP__
+#ifndef __SD_LTXV_HPP__
-#define __LTXV_HPP__
+#define __SD_LTXV_HPP__
 #include <algorithm>
 #include <cmath>
@ -79,6 +79,30 @@ namespace LTXV {
        return out;
    }
    // Builds the frequency grid in double precision.
    //
    // The grid has dim / (2 * positional_dims) entries (integer division).
    // Entry i is exp(log(theta) * i / (count - 1)) * pi/2; a grid with a single
    // entry holds just pi/2. An empty vector is returned when the parameters
    // yield no frequencies (positional_dims == 0 or a non-positive count).
    __STATIC_INLINE__ std::vector<double> generate_freq_grid_double(double theta,
                                                                    int positional_dims,
                                                                    int dim) {
        constexpr double half_pi = 1.5707963267948966;

        // Reject degenerate sizes before dividing or allocating: a zero divisor
        // is undefined behaviour, and a negative count would be converted to a
        // huge unsigned size by the vector constructor.
        const int n_elem = 2 * positional_dims;
        if (n_elem == 0) {
            return {};
        }
        const int freq_count = dim / n_elem;
        if (freq_count <= 0) {
            return {};
        }

        std::vector<double> out(static_cast<size_t>(freq_count));
        if (freq_count == 1) {
            // A single entry has no spacing to interpolate over.
            out[0] = half_pi;
            return out;
        }
        const double log_theta = std::log(theta);
        const double last_idx  = static_cast<double>(freq_count - 1);
        for (int i = 0; i < freq_count; i++) {
            const double ratio          = static_cast<double>(i) / last_idx;
            out[static_cast<size_t>(i)] = std::exp(log_theta * ratio) * half_pi;
        }
        return out;
    }
    __STATIC_INLINE__ std::vector<float> build_rope_matrix_from_frequencies(
        const std::vector<std::vector<float>>& frequencies,
        int dim) {
@ -102,16 +126,43 @@ namespace LTXV {
        return out;
    }
    // Re-slices per-token frequency rows into per-(token, head) rows.
    //
    // Every input row must hold inner_dim / 2 values. It is cut into num_heads
    // consecutive chunks of equal length, and the chunks are emitted in
    // token-major order: output row index = token * num_heads + head.
    __STATIC_INLINE__ std::vector<std::vector<float>> split_frequencies_by_heads(
        const std::vector<std::vector<float>>& frequencies,
        int inner_dim,
        int num_heads) {
        GGML_ASSERT(num_heads > 0);
        GGML_ASSERT(inner_dim % num_heads == 0);
        const int half_width  = inner_dim / 2;
        const int chunk_width = half_width / num_heads;
        GGML_ASSERT(half_width % num_heads == 0);

        // Pre-size the result with zero-filled rows, then fill them in order.
        std::vector<std::vector<float>> per_head_rows(
            frequencies.size() * static_cast<size_t>(num_heads),
            std::vector<float>(chunk_width, 0.f));
        auto dst_row = per_head_rows.begin();
        for (const auto& row : frequencies) {
            GGML_ASSERT(static_cast<int>(row.size()) == half_width);
            auto chunk_begin = row.begin();
            for (int head = 0; head < num_heads; head++, ++dst_row) {
                const auto chunk_end = chunk_begin + chunk_width;
                std::copy(chunk_begin, chunk_end, dst_row->begin());
                chunk_begin = chunk_end;
            }
        }
        return per_head_rows;
    }
    __STATIC_INLINE__ std::vector<float> build_video_rope_matrix(int64_t width,
                                                                 int64_t height,
                                                                 int64_t frames,
                                                                 int dim,
                                                                 int num_heads                                      = 1,
                                                                 float frame_rate                                   = 25.f,
                                                                 float theta                                        = 10000.f,
                                                                 const std::vector<int>& max_pos                    = {20, 2048, 2048},
                                                                 const std::tuple<int, int, int>& vae_scale_factors = {8, 32, 32},
-                                                                 bool causal_temporal_positioning                   = false) {
+                                                                 bool causal_temporal_positioning                   = false,
                                                                 bool use_middle_indices_grid                       = false) {
        GGML_ASSERT(max_pos.size() == 3);
        GGML_ASSERT(dim % num_heads == 0);
        const std::vector<float> indices = generate_freq_grid(theta, 3, dim);
        const int half_dim               = dim / 2;
        const int pad_size               = half_dim - static_cast<int>(indices.size()) * 3;
@ -129,11 +180,25 @@ namespace LTXV {
                pixel_t = std::max(0.f, pixel_t + 1.f - scale_t);
            }
            pixel_t /= frame_rate;
            if (use_middle_indices_grid) {
                float end = static_cast<float>((t + 1) * scale_t);
                if (causal_temporal_positioning) {
                    end = std::max(0.f, end + 1.f - scale_t);
                }
                end /= frame_rate;
                pixel_t = 0.5f * (pixel_t + end);
            }
            for (int64_t h = 0; h < height; h++) {
                float pixel_h = static_cast<float>(h * scale_h);
                if (use_middle_indices_grid) {
                    pixel_h += 0.5f * static_cast<float>(scale_h);
                }
                for (int64_t w = 0; w < width; w++) {
                    float pixel_w = static_cast<float>(w * scale_w);
                    if (use_middle_indices_grid) {
                        pixel_w += 0.5f * static_cast<float>(scale_w);
                    }
                    int out_idx = 0;
                    for (int i = 0; i < pad_size; i++) {
@ -146,13 +211,6 @@ namespace LTXV {
                        pixel_w / max_pos[2],
                    };
                    // Match ComfyUI generate_freqs():
                    //   (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
                    //       .transpose(-1, -2)
                    //       .flatten(2)
                    // After the transpose, the half-dim order is:
                    //   [t_f0, h_f0, w_f0, t_f1, h_f1, w_f1, ...]
                    // not [t_f0, t_f1, ..., h_f0, h_f1, ..., w_f0, w_f1, ...].
                    for (float index : indices) {
                        for (int axis = 0; axis < 3; axis++) {
                            freqs[token][out_idx++] = index * (coords[axis] * 2.f - 1.f);
@ -163,16 +221,24 @@ namespace LTXV {
            }
        }
        if (num_heads > 1) {
            return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
        }
        return build_rope_matrix_from_frequencies(freqs, dim);
    }
    __STATIC_INLINE__ std::vector<float> build_1d_rope_matrix(int64_t seq_len,
                                                              int dim,
                                                              int num_heads          = 1,
                                                              float theta            = 10000.f,
-                                                              float positional_scale = 4096.f) {
+                                                              float positional_scale = 4096.f,
-        const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
+                                                              bool double_precision  = false) {
        GGML_ASSERT(dim % num_heads == 0);
        const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
        const std::vector<double> indices_d =
            double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
        const int half_dim = dim / 2;
-        const int pad_size               = half_dim - static_cast<int>(indices.size());
+        const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
        std::vector<std::vector<float>> freqs(static_cast<size_t>(seq_len), std::vector<float>(half_dim, 0.f));
        for (int64_t pos = 0; pos < seq_len; pos++) {
@ -181,20 +247,39 @@ namespace LTXV {
                freqs[static_cast<size_t>(pos)][out_idx++] = 0.f;
            }
            if (double_precision) {
                double coord = static_cast<double>(pos) / static_cast<double>(positional_scale);
                for (double index : indices_d) {
                    freqs[static_cast<size_t>(pos)][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
                }
            } else {
                float coord = static_cast<float>(pos) / positional_scale;
                for (float index : indices) {
                    freqs[static_cast<size_t>(pos)][out_idx++] = index * (coord * 2.f - 1.f);
                }
            }
        }
        if (num_heads > 1) {
            return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
        }
        return build_rope_matrix_from_frequencies(freqs, dim);
    }
    // Applies the rotary position embedding `pe` to `x` through Rope::apply_rope,
    // after exposing the attention heads as their own dimension.
    //
    // x:                3-D tensor whose ne[0] must equal heads * dim_head.
    // pe:               position-embedding tensor handed to Rope::apply_rope.
    // rope_interleaved: forwarded unchanged to Rope::apply_rope.
    //
    // When pe carries one entry per (ne[1] index, head) pair, i.e.
    // pe->ne[3] == x->ne[1] * heads, the heads are folded into the third
    // dimension for the rope call and the result is reshaped back to
    // [heads * dim_head, x->ne[1], x->ne[2]] (ggml ne order). Otherwise the
    // 4-D view is passed to Rope::apply_rope and its result returned as is.
    __STATIC_INLINE__ ggml_tensor* apply_hidden_rope(ggml_context* ctx,
                                                     ggml_tensor* x,
                                                     ggml_tensor* pe,
                                                     int64_t heads,
                                                     int64_t dim_head,
                                                     bool rope_interleaved) {
-        auto x4 = ggml_reshape_4d(ctx, x, x->ne[0], 1, x->ne[1], x->ne[2]);
+        GGML_ASSERT(x->ne[0] == heads * dim_head);
        // Split the hidden dimension into (dim_head, heads).
        auto x4 = ggml_reshape_4d(ctx, x, dim_head, heads, x->ne[1], x->ne[2]);
        if (pe != nullptr && pe->ne[3] == x->ne[1] * heads) {
            // Per-head pe: merge heads into the third dimension so each
            // (ne[1] index, head) pair lines up with one pe entry.
            auto x_flat   = ggml_reshape_4d(ctx, x4, dim_head, 1, x->ne[1] * heads, x->ne[2]);
            auto out_flat = Rope::apply_rope(ctx, x_flat, pe, rope_interleaved);
            auto out4     = ggml_reshape_4d(ctx, out_flat, dim_head, heads, x->ne[1], x->ne[2]);
            return ggml_reshape_3d(ctx, out4, heads * dim_head, x->ne[1], x->ne[2]);
        }
        // NOTE(review): the null check above only guards the per-head branch; pe
        // may still be null here - confirm Rope::apply_rope accepts that.
        return Rope::apply_rope(ctx, x4, pe, rope_interleaved);
    }
@ -338,8 +423,8 @@ namespace LTXV {
                if (k_pe == nullptr) {
                    k_pe = pe;
                }
-                q = apply_hidden_rope(ctx->ggml_ctx, q, pe, rope_interleaved);
+                q = apply_hidden_rope(ctx->ggml_ctx, q, pe, heads, dim_head, rope_interleaved);
-                k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, rope_interleaved);
+                k = apply_hidden_rope(ctx->ggml_ctx, k, k_pe, heads, dim_head, rope_interleaved);
            }
            auto out = ggml_ext_attention_ext(ctx->ggml_ctx,
@ -415,7 +500,7 @@ namespace LTXV {
            s             = ggml_repeat(ctx->ggml_ctx, s, e);
            t             = ggml_repeat(ctx->ggml_ctx, t, e);
            auto out      = ggml_add(ctx->ggml_ctx, s, t);
-            return ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
+            return ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
        }
        std::vector<ggml_tensor*> get_prompt_scale_shift_values(GGMLRunnerContext* ctx,
@ -609,7 +694,7 @@ namespace LTXV {
        float positional_embedding_theta              = 10000.f;
        std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
        std::tuple<int, int, int> vae_scale_factors   = {8, 32, 32};
-        bool causal_temporal_positioning              = false;
+        bool causal_temporal_positioning              = true;
        float timestep_scale_multiplier               = 1000.f;
        int64_t audio_in_channels                           = 128;
@ -641,11 +726,14 @@ namespace LTXV {
        bool audio_connector_rope_interleaved      = false;
        bool audio_connector_apply_gated_attention = false;
-        bool video_rope_interleaved = true;
+        bool video_rope_interleaved  = false;
        bool use_middle_indices_grid = true;
        bool cross_attention_adaln   = false;
        bool use_caption_projection          = true;
        bool use_audio_caption_projection    = true;
        bool caption_proj_before_connector   = true;
        bool caption_projection_first_linear = false;
        bool self_attention_gated  = false;
        bool cross_attention_gated = false;
@ -670,11 +758,16 @@ namespace LTXV {
    __STATIC_INLINE__ std::vector<float> build_1d_rope_matrix_from_coords(const std::vector<float>& coords,
                                                                          int dim,
                                                                          int num_heads         = 1,
                                                                          float theta           = 10000.f,
-                                                                          float max_pos = 20.f) {
+                                                                          float max_pos         = 20.f,
-        const std::vector<float> indices = generate_freq_grid(theta, 1, dim);
+                                                                          bool double_precision = false) {
        GGML_ASSERT(dim % num_heads == 0);
        const std::vector<float> indices = double_precision ? std::vector<float>() : generate_freq_grid(theta, 1, dim);
        const std::vector<double> indices_d =
            double_precision ? generate_freq_grid_double(static_cast<double>(theta), 1, dim) : std::vector<double>();
        const int half_dim = dim / 2;
-        const int pad_size               = half_dim - static_cast<int>(indices.size());
+        const int pad_size = half_dim - static_cast<int>(double_precision ? indices_d.size() : indices.size());
        std::vector<std::vector<float>> freqs(coords.size(), std::vector<float>(half_dim, 0.f));
        for (size_t pos = 0; pos < coords.size(); pos++) {
@ -682,11 +775,21 @@ namespace LTXV {
            for (int i = 0; i < pad_size; i++) {
                freqs[pos][out_idx++] = 0.f;
            }
            if (double_precision) {
                double coord = static_cast<double>(coords[pos]) / static_cast<double>(max_pos);
                for (double index : indices_d) {
                    freqs[pos][out_idx++] = static_cast<float>(index * (coord * 2.0 - 1.0));
                }
            } else {
                float coord = coords[pos] / max_pos;
                for (float index : indices) {
                    freqs[pos][out_idx++] = index * (coord * 2.f - 1.f);
                }
            }
        }
        if (num_heads > 1) {
            return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
        }
        return build_rope_matrix_from_frequencies(freqs, dim);
    }
@ -705,6 +808,7 @@ namespace LTXV {
                                                                          int64_t height,
                                                                          int64_t frames,
                                                                          int dim,
                                                                          int num_heads,
                                                                          float frame_rate,
                                                                          float theta,
                                                                          int max_pos_t,
@ -725,7 +829,7 @@ namespace LTXV {
                }
            }
        }
-        return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
+        return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
    }
    __STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
@ -742,6 +846,7 @@ namespace LTXV {
    __STATIC_INLINE__ std::vector<float> build_audio_rope_matrix(int64_t seq_len,
                                                                 int dim,
                                                                 int num_heads,
                                                                 float theta                  = 10000.f,
                                                                 int max_pos_t                = 20,
                                                                 bool use_middle_indices_grid = false) {
@ -755,7 +860,7 @@ namespace LTXV {
                coords[static_cast<size_t>(t)] = start;
            }
        }
-        return build_1d_rope_matrix_from_coords(coords, dim, theta, static_cast<float>(max_pos_t));
+        return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
    }
    struct BasicAVTransformerBlock : public GGMLBlock {
@ -825,7 +930,7 @@ namespace LTXV {
            t        = ggml_repeat(ctx->ggml_ctx, t, e);
            s        = ggml_repeat(ctx->ggml_ctx, s, e);
            auto out = ggml_add(ctx->ggml_ctx, s, t);
-            auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, coeff, 1);
+            auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
            return std::vector<ggml_tensor*>(chunks.begin() + start, chunks.begin() + start + count);
        }
@ -1004,11 +1109,23 @@ namespace LTXV {
            blocks["av_ca_v2a_gate_adaln_single"]          = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 1);
            if (cfg.use_caption_projection) {
                if (cfg.caption_proj_before_connector) {
                    if (cfg.caption_projection_first_linear) {
                        blocks["caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.hidden_size);
                    }
                } else {
                    blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.hidden_size, cfg.hidden_size);
                }
            }
            if (cfg.use_audio_caption_projection) {
                if (cfg.caption_proj_before_connector) {
                    if (cfg.caption_projection_first_linear) {
                        blocks["audio_caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.audio_hidden_size);
                    }
                } else {
                    blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.audio_hidden_size, cfg.audio_hidden_size);
                }
            }
            if (cfg.use_connector) {
                blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.connector_hidden_size,
@ -1080,42 +1197,97 @@ namespace LTXV {
        std::pair<ggml_tensor*, ggml_tensor*> preprocess_contexts(GGMLRunnerContext* ctx,
                                                                  ggml_tensor* context,
                                                                  ggml_tensor* video_connector_pe,
-                                                                  ggml_tensor* audio_connector_pe) {
+                                                                  ggml_tensor* audio_connector_pe,
                                                                  bool process_audio_context) {
            if (context == nullptr) {
                return {nullptr, nullptr};
            }
-            if (context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim) {
+            bool is_fully_processed_context =
-                return {
+                context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
-                    ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim),
+                context->ne[1] >= 1024;
-                    ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim)
+            bool is_unprocessed_dual_context =
-                };
+                context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
                context->ne[1] < 1024;
            if (is_fully_processed_context) {
                auto v_context         = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
                ggml_tensor* a_context = nullptr;
                if (process_audio_context) {
                    a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
                }
                return {v_context, a_context};
            }
            ggml_tensor* v_context = context;
-            ggml_tensor* a_context = context;
+            ggml_tensor* a_context = process_audio_context ? context : nullptr;
-            if (context->ne[0] == cfg.caption_channels * 2) {
+            if (is_unprocessed_dual_context) {
                v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
                if (process_audio_context) {
                    a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
                }
            } else if (context->ne[0] == cfg.caption_channels * 2) {
                v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.caption_channels);
                if (process_audio_context) {
                    a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.caption_channels, cfg.caption_channels * 2);
                }
            }
            if (cfg.caption_proj_before_connector) {
                if (cfg.use_caption_projection &&
                    blocks.count("caption_projection") > 0 &&
                    v_context != nullptr &&
                    v_context->ne[0] == cfg.caption_channels) {
                    auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["caption_projection"]);
                    if (caption_projection != nullptr) {
                        v_context = caption_projection->forward(ctx, v_context);
                    }
                }
                if (process_audio_context &&
                    cfg.use_audio_caption_projection &&
                    blocks.count("audio_caption_projection") > 0 &&
                    a_context != nullptr &&
                    a_context->ne[0] == cfg.caption_channels) {
                    auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["audio_caption_projection"]);
                    if (caption_projection != nullptr) {
                        a_context = caption_projection->forward(ctx, a_context);
                    }
                }
            }
            if (cfg.use_connector && v_context != nullptr && v_context->ne[0] == cfg.connector_hidden_size) {
                auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["video_embeddings_connector"]);
                v_context      = connector->forward(ctx, v_context, video_connector_pe);
            }
-            if (cfg.use_audio_connector && a_context != nullptr && a_context->ne[0] == cfg.audio_connector_hidden_size) {
+            if (process_audio_context &&
                cfg.use_audio_connector &&
                a_context != nullptr &&
                a_context->ne[0] == cfg.audio_connector_hidden_size) {
                auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["audio_embeddings_connector"]);
                a_context      = connector->forward(ctx, a_context, audio_connector_pe);
            }
-            if (cfg.use_caption_projection && v_context != nullptr && v_context->ne[0] == cfg.caption_channels) {
+            if (!cfg.caption_proj_before_connector &&
                cfg.use_caption_projection &&
                blocks.count("caption_projection") > 0 &&
                v_context != nullptr &&
                v_context->ne[0] == cfg.caption_channels) {
                auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["caption_projection"]);
                if (caption_projection != nullptr) {
                    v_context = caption_projection->forward(ctx, v_context);
                }
-            if (cfg.use_audio_caption_projection && a_context != nullptr && a_context->ne[0] == cfg.caption_channels) {
+            }
            if (process_audio_context &&
                !cfg.caption_proj_before_connector &&
                cfg.use_audio_caption_projection &&
                blocks.count("audio_caption_projection") > 0 &&
                a_context != nullptr &&
                a_context->ne[0] == cfg.caption_channels) {
                auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["audio_caption_projection"]);
                if (caption_projection != nullptr) {
                    a_context = caption_projection->forward(ctx, a_context);
                }
            }
            return {v_context, a_context};
        }
@ -1168,9 +1340,13 @@ namespace LTXV {
                ax = nullptr;
            }
-            auto contexts = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe);
+            bool run_ax    = ax != nullptr && ggml_nelements(ax) > 0 && audio_time > 0;
            auto contexts  = preprocess_contexts(ctx, context, video_connector_pe, audio_connector_pe, run_ax);
            auto v_context = contexts.first;
            auto a_context = contexts.second != nullptr ? contexts.second : contexts.first;
            if (contexts.second != nullptr) {
                a_context = ggml_cont(ctx->ggml_ctx, a_context);
            }
            auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, cfg.timestep_scale_multiplier);
            auto v_pair            = adaln_single->forward(ctx, v_timestep_scaled);
@ -1257,6 +1433,8 @@ namespace LTXV {
        std::vector<float> audio_cross_pe_vec;
        std::vector<float> connector_pe_vec;
        std::vector<float> audio_connector_pe_vec;
        sd::Tensor<float> vx_input_cache;
        sd::Tensor<float> ax_input_cache;
        static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
                                        const std::string& bias_name,
@ -1388,7 +1566,7 @@ namespace LTXV {
            model.get_param_tensors(tensors, prefix);
        }
-        std::pair<sd::Tensor<float>, sd::Tensor<float>> separate_audio_and_video_latents(const sd::Tensor<float>& x_tensor,
+        std::pair<sd::Tensor<float>, sd::Tensor<float>> split_av_latents(const sd::Tensor<float>& x_tensor,
                                                                         int audio_length) const {
            if (x_tensor.empty()) {
                return {{}, {}};
@ -1424,7 +1602,7 @@ namespace LTXV {
            return {vx, ax};
        }
-        ggml_tensor* recombine_audio_and_video_latents(ggml_context* ctx,
+        ggml_tensor* merge_av_latents(ggml_context* ctx,
                                      ggml_tensor* vx,
                                      ggml_tensor* ax) const {
            if (ax == nullptr || ggml_nelements(ax) == 0 || ax->ne[1] == 0) {
@ -1455,12 +1633,16 @@ namespace LTXV {
                                 const sd::Tensor<float>& audio_x_tensor         = {},
                                 const sd::Tensor<float>& audio_timesteps_tensor = {},
                                 int audio_length                                = 0) {
-            auto split_inputs = separate_audio_and_video_latents(x_tensor, audio_length);
+            auto split_inputs = split_av_latents(x_tensor, audio_length);
-            const sd::Tensor<float>& vx_tensor = split_inputs.first;
+            vx_input_cache    = split_inputs.first;
-            const sd::Tensor<float>& ax_tensor = !audio_x_tensor.empty() ? audio_x_tensor : split_inputs.second;
+            if (!audio_x_tensor.empty()) {
                ax_input_cache = audio_x_tensor;
            } else {
                ax_input_cache = split_inputs.second;
            }
-            ggml_tensor* vx         = make_input(vx_tensor);
+            ggml_tensor* vx         = make_input(vx_input_cache);
-            ggml_tensor* ax         = make_optional_input(ax_tensor);
+            ggml_tensor* ax         = make_optional_input(ax_input_cache);
            ggml_tensor* timesteps  = make_input(timesteps_tensor);
            ggml_tensor* a_timestep = make_optional_input(audio_timesteps_tensor);
            ggml_tensor* context    = make_optional_input(context_tensor);
@ -1471,12 +1653,15 @@ namespace LTXV {
                                                    vx->ne[1],
                                                    vx->ne[2],
                                                    static_cast<int>(params.hidden_size),
-                                                   25.f,
+                                                    static_cast<int>(params.num_attention_heads),
                                                    24.f,
                                                    params.positional_embedding_theta,
                                                    params.positional_embedding_max_pos,
                                                    params.vae_scale_factors,
-                                                   params.causal_temporal_positioning);
+                                                    params.causal_temporal_positioning,
-            auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.hidden_size / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
+                                                    params.use_middle_indices_grid);
            auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads);
            ggml_set_name(video_pe, "ltxav_video_pe");
            set_backend_tensor_data(video_pe, video_pe_vec.data());
            ggml_tensor* audio_pe       = nullptr;
@ -1485,10 +1670,12 @@ namespace LTXV {
            if (ax != nullptr && ggml_nelements(ax) > 0 && ax->ne[1] > 0) {
                audio_pe_vec = build_audio_rope_matrix(ax->ne[1],
                                                       static_cast<int>(params.audio_hidden_size),
                                                       static_cast<int>(params.audio_num_attention_heads),
                                                       params.positional_embedding_theta,
                                                       params.audio_positional_embedding_max_pos[0],
-                                                       false);
+                                                       params.use_middle_indices_grid);
-                audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_hidden_size / 2, ax->ne[1]);
+                audio_pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
                ggml_set_name(audio_pe, "ltxav_audio_pe");
                set_backend_tensor_data(audio_pe, audio_pe_vec.data());
                int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
@ -1496,43 +1683,68 @@ namespace LTXV {
                                                                        vx->ne[1],
                                                                        vx->ne[2],
                                                                        static_cast<int>(params.audio_cross_attention_dim),
                                                                        static_cast<int>(params.audio_num_attention_heads),
                                                                        25.f,
                                                                        params.positional_embedding_theta,
                                                                        temporal_max_pos,
                                                                        std::get<0>(params.vae_scale_factors),
                                                                        params.causal_temporal_positioning,
                                                                        true);
-                video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2]);
+                video_cross_pe       = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads);
                ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
                set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
                audio_cross_pe_vec = build_audio_rope_matrix(ax->ne[1],
                                                             static_cast<int>(params.audio_cross_attention_dim),
                                                             static_cast<int>(params.audio_num_attention_heads),
                                                             params.positional_embedding_theta,
                                                             temporal_max_pos,
                                                             true);
-                audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_cross_attention_dim / 2, ax->ne[1]);
+                audio_cross_pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
                ggml_set_name(audio_cross_pe, "ltxav_audio_cross_pe");
                set_backend_tensor_data(audio_cross_pe, audio_cross_pe_vec.data());
            }
            bool needs_video_connector_pe =
                params.use_connector &&
                context != nullptr &&
                (context->ne[0] == params.connector_hidden_size ||
                 ((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
                   context->ne[0] == params.caption_channels * 2) &&
                  context->ne[1] < 1024));
            ggml_tensor* video_connector_pe = nullptr;
-            if (params.use_connector && context != nullptr && context->ne[0] == params.connector_hidden_size) {
+            if (needs_video_connector_pe) {
                int64_t seq_len      = context->ne[1];
                int64_t target_len   = std::max<int64_t>(1024, seq_len);
                int64_t duplications = (target_len + params.connector_num_registers - 1) / params.connector_num_registers;
                int64_t full_len     = seq_len + duplications * params.connector_num_registers - seq_len;
-                connector_pe_vec     = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size));
+                connector_pe_vec     = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size), static_cast<int>(params.connector_num_heads), 10000.f, 4096.f, true);
-                video_connector_pe   = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_hidden_size / 2, full_len);
+                video_connector_pe   = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_head_dim / 2, full_len * params.connector_num_heads);
                ggml_set_name(video_connector_pe, "ltxav_video_connector_pe");
                set_backend_tensor_data(video_connector_pe, connector_pe_vec.data());
            }
            bool run_audio_context =
                ax != nullptr &&
                ggml_nelements(ax) > 0 &&
                ax->ne[1] > 0;
            bool needs_audio_connector_pe =
                run_audio_context &&
                params.use_audio_connector &&
                context != nullptr &&
                (context->ne[0] == params.audio_connector_hidden_size ||
                 ((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
                   context->ne[0] == params.caption_channels * 2) &&
                  context->ne[1] < 1024));
            ggml_tensor* audio_connector_pe = nullptr;
-            if (params.use_audio_connector && context != nullptr && context->ne[0] == params.audio_connector_hidden_size) {
+            if (needs_audio_connector_pe) {
                int64_t seq_len        = context->ne[1];
                int64_t target_len     = std::max<int64_t>(1024, seq_len);
                int64_t duplications   = (target_len + params.audio_connector_num_registers - 1) / params.audio_connector_num_registers;
                int64_t full_len       = seq_len + duplications * params.audio_connector_num_registers - seq_len;
-                audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size));
+                audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size), static_cast<int>(params.audio_connector_num_heads), 10000.f, 4096.f, true);
-                audio_connector_pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_hidden_size / 2, full_len);
+                audio_connector_pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_head_dim / 2, full_len * params.audio_connector_num_heads);
                ggml_set_name(audio_connector_pe, "ltxav_audio_connector_pe");
                set_backend_tensor_data(audio_connector_pe, audio_connector_pe_vec.data());
            }
@ -1549,7 +1761,7 @@ namespace LTXV {
                                            audio_cross_pe,
                                            video_connector_pe,
                                            audio_connector_pe);
-            auto out = recombine_audio_and_video_latents(compute_ctx, out_pair.first, out_pair.second);
+            auto out        = merge_av_latents(compute_ctx, out_pair.first, out_pair.second);
            ggml_build_forward_expand(gf, out);
            return gf;
        }
@ -1564,7 +1776,106 @@ namespace LTXV {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
+            auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
            return out;
        }
        // Debug harness: loads the input tensors from disk, runs a single
        // compute() pass with 8 threads, and prints every input as well as the
        // resulting output tensor.
        //
        // x_path               - required; file holding the input tensor `x`.
        // timesteps_path       - optional; when empty, a single-element tensor
        //                        {1.0f} is used instead.
        // context_path         - optional; when empty, an empty context tensor
        //                        is passed to compute().
        // audio_x_path         - optional; when set, audio_length is taken from
        //                        shape()[1] of the loaded tensor.
        // audio_timesteps_path - optional; when empty and audio_x was loaded,
        //                        the video timesteps are reused.
        //
        // Any tensor that is requested but turns out empty trips GGML_ASSERT.
        void test(const std::string& x_path,
                  const std::string& timesteps_path       = "",
                  const std::string& context_path         = "",
                  const std::string& audio_x_path         = "",
                  const std::string& audio_timesteps_path = "") {
            auto x = sd::load_tensor_from_file_as_tensor<float>(x_path);
            GGML_ASSERT(!x.empty());
            print_sd_tensor(x, false, "ltxav_x");

            sd::Tensor<float> timesteps;
            if (!timesteps_path.empty()) {
                timesteps = sd::load_tensor_from_file_as_tensor<float>(timesteps_path);
            } else {
                timesteps = sd::Tensor<float>::from_vector(std::vector<float>{1.f});
            }
            GGML_ASSERT(!timesteps.empty());
            print_sd_tensor(timesteps, false, "ltxav_timesteps");

            sd::Tensor<float> context;
            if (!context_path.empty()) {
                context = sd::load_tensor_from_file_as_tensor<float>(context_path);
                GGML_ASSERT(!context.empty());
                print_sd_tensor(context, false, "ltxav_context");
            }

            sd::Tensor<float> audio_x;
            int audio_length = 0;
            if (!audio_x_path.empty()) {
                audio_x = sd::load_tensor_from_file_as_tensor<float>(audio_x_path);
                GGML_ASSERT(!audio_x.empty());
                // shape()[1] is read below, so at least two dimensions are required.
                GGML_ASSERT(audio_x.dim() >= 2);
                audio_length = static_cast<int>(audio_x.shape()[1]);
                print_sd_tensor(audio_x, false, "ltxav_audio_x");
            }

            sd::Tensor<float> audio_timesteps;
            if (!audio_timesteps_path.empty()) {
                audio_timesteps = sd::load_tensor_from_file_as_tensor<float>(audio_timesteps_path);
                GGML_ASSERT(!audio_timesteps.empty());
            } else if (!audio_x.empty()) {
                // No dedicated audio schedule given: fall back to the video timesteps.
                audio_timesteps = timesteps;
            }
            if (!audio_timesteps.empty()) {
                print_sd_tensor(audio_timesteps, false, "ltxav_audio_timesteps");
            }

            const int64_t t0 = ggml_time_ms();
            auto out         = compute(8, x, timesteps, context, audio_x, audio_timesteps, audio_length);
            const int64_t t1 = ggml_time_ms();

            GGML_ASSERT(!out.empty());
            print_sd_tensor(out, false, "ltxav_out");
            // int64_t is `long` on LP64 targets, so cast explicitly to match %lld.
            LOG_DEBUG("ltxav test done in %lldms", static_cast<long long>(t1 - t0));
        }
        // Standalone smoke test: creates a CPU backend, builds an LTXAVRunner,
        // loads its weights from `model_path` (plus, optionally, additional
        // tensors from `embeddings_path`) and then forwards the remaining paths
        // to test().
        //
        // Every failure is reported through LOG_ERROR and ends the function
        // early; nothing is returned to the caller.
        //
        // NOTE(review): `backend` is never released with ggml_backend_free()
        // here. Confirm whether the runner takes ownership before adding a free.
        static void load_from_file_and_test(const std::string& model_path,
                                            const std::string& x_path,
                                            const std::string& timesteps_path       = "",
                                            const std::string& context_path         = "",
                                            const std::string& embeddings_path      = "",
                                            const std::string& audio_x_path         = "",
                                            const std::string& audio_timesteps_path = "") {
            // ggml_backend_t backend = ggml_backend_cuda_init(0);
            ggml_backend_t backend = ggml_backend_cpu_init();
            if (backend == nullptr) {
                // Do not hand a null backend to the runner.
                LOG_ERROR("init cpu backend failed");
                return;
            }

            LOG_INFO("loading ltxav from '%s'", model_path.c_str());
            ModelLoader model_loader;
            if (!model_loader.init_from_file_and_convert_name(model_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
                return;
            }

            if (!embeddings_path.empty()) {
                LOG_INFO("loading ltxav embeddings from '%s'", embeddings_path.c_str());
                if (!model_loader.init_from_file(embeddings_path)) {
                    LOG_ERROR("init embeddings model loader from file failed: '%s'", embeddings_path.c_str());
                    return;
                }
            }

            auto& tensor_storage_map           = model_loader.get_tensor_storage_map();
            std::shared_ptr<LTXAVRunner> ltxav = std::make_shared<LTXAVRunner>(backend,
                                                                               false,
                                                                               tensor_storage_map,
                                                                               "model.diffusion_model");

            // Allocate the parameter buffer first, then collect the tensors the
            // loader should fill in.
            ltxav->alloc_params_buffer();
            std::map<std::string, ggml_tensor*> tensors;
            ltxav->get_param_tensors(tensors, "model.diffusion_model");

            if (!model_loader.load_tensors(tensors)) {
                LOG_ERROR("load tensors from model loader failed");
                return;
            }

            LOG_INFO("ltxav model loaded");
            ltxav->test(x_path, timesteps_path, context_path, audio_x_path, audio_timesteps_path);
        }
    };
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -14,7 +14,7 @@
 #include "diffusion_model.hpp"
 #include "esrgan.hpp"
 #include "lora.hpp"
-#include "ltx_vae.h"
+#include "ltx_vae.hpp"
 #include "pmid.hpp"
 #include "sample-cache.h"
 #include "tae.hpp"
@ -3742,8 +3742,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
    }
    if (sd_version_is_ltxav(sd_ctx->sd->version)) {
-        latents.audio_length = get_ltxav_num_audio_latents(request->frames, request->fps);
+        latents.audio_length = 0;
-        latents.audio_latent = sd::zeros<float>({16, latents.audio_length, 8, 1});
+        latents.audio_latent = {};
    }
    if (sd_version_is_ltxav(sd_ctx->sd->version)) {
@ -3923,9 +3923,8 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
        latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
    }
-    if (!latents.audio_latent.empty()) {
+    // Pipeline-level audio support is temporarily disabled. Keep the model-side
-        latents.init_latent = pack_ltxav_audio_and_video_latents(latents.init_latent, latents.audio_latent);
+    // AV implementation intact, but feed pure video latents through vid_gen.
    }
    return latents;
 }
--- a/src/tokenizers/vocab/gemma_merges.hpp
+++ b/src/tokenizers/vocab/gemma_merges.hpp
--- a/src/tokenizers/vocab/gemma_vocab.hpp
+++ b/src/tokenizers/vocab/gemma_vocab.hpp
Author	SHA1	Message	Date
leejet	2ca782a65a	fix ubuntu build	2026-04-29 01:30:04 +08:00
leejet	d51f35bf63	fix ci	2026-04-29 01:23:20 +08:00
leejet	0b65927b1b	change vocab file encoding	2026-04-29 01:17:45 +08:00
leejet	831b321c6a	add basic ltx2.3 support	2026-04-29 01:11:15 +08:00