leejet 2026-04-27 21:43:22 +08:00
parent 274ecd5d41
commit ca7e008d78
19 changed files with 3415 additions and 141 deletions

View File

@ -20,6 +20,7 @@
#include "common/resource_owners.hpp"
#include "image_metadata.h"
#include "llm.hpp"
#include "ltx_vae_test.h"
namespace fs = std::filesystem;
@ -503,12 +504,24 @@ int main(int argc, const char* argv[]) {
cli_params.verbose = true;
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
GemmaTokenizer tokenizer;
auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
for (auto token : tokens) {
LOG_INFO("%d", token);
}
{
const bool run_ltx_vae_test = false;
const std::string model_path = "E:/Code/ComfyUI/models/vae/ltx-2.3-22b-dev_video_vae.safetensors";
const std::string input_path = "E:/Code/sd.cpp/build/ltx_vae_z.bin";
if (run_ltx_vae_test) {
ltx_vae_load_from_file_and_test(model_path, input_path);
return 0;
}
}
// cli_params.verbose = true;
// sd_set_log_callback(sd_log_cb, (void*)&cli_params);
// GemmaTokenizer tokenizer;
// auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
// for (auto token : tokens) {
// LOG_INFO("%d", token);
// }
// return 0;
parse_args(argc, argv, cli_params, ctx_params, gen_params);
sd_set_log_callback(sd_log_cb, (void*)&cli_params);

View File

@ -340,6 +340,10 @@ ArgOptions SDContextParams::get_options() {
"--high-noise-diffusion-model",
"path to the standalone high noise diffusion model",
&high_noise_diffusion_model_path},
{"",
"--embeddings-connectors",
"path to LTXAV embeddings connectors",
&embeddings_connectors_path},
{"",
"--vae",
"path to standalone vae model",
@ -656,6 +660,7 @@ std::string SDContextParams::to_string() const {
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
<< " embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
<< " vae_path: \"" << vae_path << "\",\n"
<< " taesd_path: \"" << taesd_path << "\",\n"
<< " esrgan_path: \"" << esrgan_path << "\",\n"
@ -712,6 +717,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
llm_vision_path.c_str(),
diffusion_model_path.c_str(),
high_noise_diffusion_model_path.c_str(),
embeddings_connectors_path.c_str(),
vae_path.c_str(),
taesd_path.c_str(),
control_net_path.c_str(),
@ -2180,6 +2186,7 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
params.strength = strength;
params.seed = seed;
params.video_frames = video_frames;
params.fps = fps;
params.vace_strength = vace_strength;
params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params;

View File

@ -92,6 +92,7 @@ struct SDContextParams {
std::string llm_vision_path;
std::string diffusion_model_path;
std::string high_noise_diffusion_model_path;
std::string embeddings_connectors_path;
std::string vae_path;
std::string taesd_path;
std::string esrgan_path;

View File

@ -171,6 +171,7 @@ typedef struct {
const char* llm_vision_path;
const char* diffusion_model_path;
const char* high_noise_diffusion_model_path;
const char* embeddings_connectors_path;
const char* vae_path;
const char* taesd_path;
const char* control_net_path;
@ -359,6 +360,7 @@ typedef struct {
float strength;
int64_t seed;
int video_frames;
int fps;
float vace_strength;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;

View File

@ -1,6 +1,8 @@
#ifndef __CONDITIONER_HPP__
#define __CONDITIONER_HPP__
#include <cmath>
#include <limits>
#include <optional>
#include "clip.hpp"
@ -46,6 +48,17 @@ static inline sd::Tensor<float> apply_token_weights(sd::Tensor<float> hidden_sta
return hidden_states;
}
bool all_one = true;
for (float weight : weights) {
if (weight != 1.0f) {
all_one = false;
break;
}
}
if (all_one) {
return hidden_states;
}
if (hidden_states.dim() == 1) {
hidden_states.unsqueeze_(1);
}
@ -57,7 +70,7 @@ static inline sd::Tensor<float> apply_token_weights(sd::Tensor<float> hidden_sta
chunk_weights.reshape_({1, static_cast<int64_t>(weights.size())});
hidden_states *= chunk_weights;
float new_mean = hidden_states.mean();
if (new_mean != 0.0f) {
if (std::isfinite(original_mean) && std::isfinite(new_mean) && new_mean != 0.0f) {
hidden_states *= (original_mean / new_mean);
}
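A minimal standalone sketch of this weighting scheme (hypothetical helper, not part of the commit): scale each token's hidden value by its weight, then rescale everything so the mean matches the pre-weighting mean; the isfinite and non-zero guards added above keep the rescale from propagating NaN or dividing by zero.
#include <cmath>
#include <vector>
// Scalar illustration of apply_token_weights; assumes h and w have the same
// non-zero length.
static std::vector<float> weight_and_rescale(std::vector<float> h,
                                             const std::vector<float>& w) {
    float original_mean = 0.0f;
    for (float v : h) original_mean += v;
    original_mean /= static_cast<float>(h.size());
    for (size_t i = 0; i < h.size(); ++i) h[i] *= w[i];  // apply per-token weights
    float new_mean = 0.0f;
    for (float v : h) new_mean += v;
    new_mean /= static_cast<float>(h.size());
    // Restore the original mean so weighted prompts keep a stable magnitude.
    if (std::isfinite(original_mean) && std::isfinite(new_mean) && new_mean != 0.0f) {
        for (float& v : h) v *= original_mean / new_mean;
    }
    return h;
}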
@ -1958,4 +1971,277 @@ struct LLMEmbedder : public Conditioner {
}
};
struct LTXAVTextProjection : public GGMLBlock {
static constexpr int64_t kHiddenSize = 3840;
static constexpr int64_t kNumStates = 49;
bool dual_projection = false;
LTXAVTextProjection(bool dual_projection = false)
: dual_projection(dual_projection) {
if (dual_projection) {
blocks["video_aggregate_embed"] = std::make_shared<Linear>(kHiddenSize * kNumStates, 4096, true);
blocks["audio_aggregate_embed"] = std::make_shared<Linear>(kHiddenSize * kNumStates, 2048, true);
} else {
blocks["projection"] = std::make_shared<Linear>(kHiddenSize * kNumStates, kHiddenSize, false);
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
if (!dual_projection) {
auto projection = std::dynamic_pointer_cast<Linear>(blocks["projection"]);
return projection->forward(ctx, x);
}
auto video_projection = std::dynamic_pointer_cast<Linear>(blocks["video_aggregate_embed"]);
auto audio_projection = std::dynamic_pointer_cast<Linear>(blocks["audio_aggregate_embed"]);
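// Pre-scale the shared input by sqrt(out_dim / hidden_dim), presumably to match
// each head's scale to its target width: ~1.033 for the 4096-dim video head,
// ~0.73 for the 2048-dim audio head.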
auto video_in = ggml_ext_scale(ctx->ggml_ctx, x, std::sqrt(4096.f / static_cast<float>(kHiddenSize)));
auto audio_in = ggml_ext_scale(ctx->ggml_ctx, x, std::sqrt(2048.f / static_cast<float>(kHiddenSize)));
auto video = video_projection->forward(ctx, video_in);
auto audio = audio_projection->forward(ctx, audio_in);
return ggml_concat(ctx->ggml_ctx, video, audio, 0);
}
};
struct LTXAVTextProjectionRunner : public GGMLRunner {
LTXAVTextProjection model;
LTXAVTextProjectionRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "")
: GGMLRunner(backend, offload_params_to_cpu),
model(tensor_storage_map.find(prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end()) {
model.init(params_ctx, tensor_storage_map, prefix);
}
std::string get_desc() override {
return "ltxav_text_projection";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) {
model.get_param_tensors(tensors, prefix);
}
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
auto x = make_input(x_tensor);
auto runner_ctx = get_context();
auto out = model.forward(&runner_ctx, x);
ggml_build_forward_expand(gf, out);
return gf;
}
sd::Tensor<float> compute(int n_threads, const sd::Tensor<float>& x) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x);
};
return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
}
};
struct LTXAVEmbedder : public Conditioner {
static constexpr int64_t kHiddenSize = 3840;
static constexpr int64_t kNumStates = 49;
static constexpr int64_t kMinLength = 1024;
std::shared_ptr<GemmaTokenizer> tokenizer;
std::shared_ptr<LLM::LLMRunner> llm;
std::shared_ptr<LTXAVTextProjectionRunner> projector;
bool dual_projection = false;
LTXAVEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string& llm_prefix = "text_encoders.llm",
const std::string& projector_prefix = "text_embedding_projection") {
tokenizer = std::make_shared<GemmaTokenizer>();
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::GEMMA3_12B,
backend,
offload_params_to_cpu,
tensor_storage_map,
llm_prefix,
false);
dual_projection = tensor_storage_map.find(projector_prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end();
projector = std::make_shared<LTXAVTextProjectionRunner>(backend,
offload_params_to_cpu,
tensor_storage_map,
projector_prefix);
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
llm->get_param_tensors(tensors, "text_encoders.llm");
projector->get_param_tensors(tensors, "text_embedding_projection");
}
void alloc_params_buffer() override {
llm->alloc_params_buffer();
projector->alloc_params_buffer();
}
void free_params_buffer() override {
llm->free_params_buffer();
projector->free_params_buffer();
}
size_t get_params_buffer_size() override {
return llm->get_params_buffer_size() + projector->get_params_buffer_size();
}
void set_flash_attention_enabled(bool enabled) override {
llm->set_flash_attention_enabled(enabled);
projector->set_flash_attention_enabled(enabled);
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
llm->set_weight_adapter(adapter);
projector->set_weight_adapter(adapter);
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
const std::pair<int, int>& attn_range) {
std::vector<std::pair<std::string, float>> parsed_attention;
if (attn_range.first >= 0 && attn_range.second > 0) {
if (attn_range.first > 0) {
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
}
if (attn_range.second - attn_range.first > 0) {
auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
parsed_attention.insert(parsed_attention.end(), new_parsed_attention.begin(), new_parsed_attention.end());
}
if (static_cast<size_t>(attn_range.second) < text.size()) {
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
}
} else {
parsed_attention.emplace_back(text, 1.f);
}
std::vector<int> tokens;
std::vector<float> weights;
for (const auto& item : parsed_attention) {
auto curr_tokens = tokenizer->encode(item.first, nullptr);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), item.second);
}
std::vector<float> mask;
tokenizer->pad_tokens(tokens, &weights, &mask, kMinLength);
return {tokens, weights, mask};
}
sd::Tensor<float> encode_prompt(int n_threads,
const std::string& prompt,
const std::pair<int, int>& prompt_attn_range) {
auto tokens_weights_mask = tokenize(prompt, prompt_attn_range);
auto& tokens = std::get<0>(tokens_weights_mask);
auto& weights = std::get<1>(tokens_weights_mask);
auto& mask = std::get<2>(tokens_weights_mask);
sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, std::vector<int32_t>(tokens.begin(), tokens.end()));
sd::Tensor<float> attention_mask;
if (!mask.empty()) {
const float mask_min = std::numeric_limits<float>::lowest() / 4.0f;
attention_mask = sd::Tensor<float>({static_cast<int64_t>(mask.size()), static_cast<int64_t>(mask.size())});
for (size_t i1 = 0; i1 < mask.size(); ++i1) {
for (size_t i0 = 0; i0 < mask.size(); ++i0) {
float value = 0.0f;
if (mask[i0] == 0.0f) {
value += mask_min;
}
if (i0 > i1) {
value += mask_min;
}
attention_mask[static_cast<int64_t>(i0 + mask.size() * i1)] = value;
}
}
}
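// The mask built above is additive: 0 keeps a key visible, while mask_min
// (float lowest / 4, so two stacked penalties cannot overflow) effectively
// removes it from the softmax. A key is penalized when it is padding
// (mask[i0] == 0) or lies in the causal future of the query (i0 > i1).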
auto hidden_states = llm->compute(n_threads,
input_ids,
attention_mask,
{},
{},
true);
GGML_ASSERT(!hidden_states.empty());
hidden_states = apply_token_weights(std::move(hidden_states), weights);
int64_t valid_tokens = 0;
for (float value : mask) {
valid_tokens += static_cast<int64_t>(value > 0.0f);
}
GGML_ASSERT(valid_tokens > 0);
hidden_states = sd::ops::slice(hidden_states,
1,
hidden_states.shape()[1] - valid_tokens,
hidden_states.shape()[1]);
hidden_states.reshape_({kHiddenSize, kNumStates, valid_tokens});
hidden_states = hidden_states.permute({1, 0, 2});
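// The LLM ran with return_all_hidden_states, so the hidden axis holds all
// kNumStates = 49 aggregated states (presumably the input embedding, the
// intermediate layer outputs, and the final normed output); the reshape and
// permute above split them back out per token.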
if (dual_projection) {
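// Dual-projection path: RMS-normalize each (state, token) vector over the
// hidden dimension before the joint video/audio projection.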
for (int64_t state_idx = 0; state_idx < kNumStates; ++state_idx) {
for (int64_t token_idx = 0; token_idx < valid_tokens; ++token_idx) {
double sq_sum = 0.0;
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
float value = hidden_states.index(state_idx, hidden_idx, token_idx);
sq_sum += static_cast<double>(value) * static_cast<double>(value);
}
float inv_rms = 1.0f / std::sqrt(static_cast<float>(sq_sum / static_cast<double>(kHiddenSize)) + 1e-6f);
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
hidden_states.index(state_idx, hidden_idx, token_idx) *= inv_rms;
}
}
}
} else {
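// Single-projection path: per state, center every value on the state mean
// and rescale so the min-max range spans roughly 8 units.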
for (int64_t state_idx = 0; state_idx < kNumStates; ++state_idx) {
double sum = 0.0;
float min_value = std::numeric_limits<float>::infinity();
float max_value = -std::numeric_limits<float>::infinity();
for (int64_t token_idx = 0; token_idx < valid_tokens; ++token_idx) {
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
float value = hidden_states.index(state_idx, hidden_idx, token_idx);
sum += value;
min_value = std::min(min_value, value);
max_value = std::max(max_value, value);
}
}
float mean_value = static_cast<float>(sum / static_cast<double>(kHiddenSize * valid_tokens));
float denom = max_value - min_value + 1e-6f;
float scale_value = 8.0f / denom;
for (int64_t token_idx = 0; token_idx < valid_tokens; ++token_idx) {
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
float value = hidden_states.index(state_idx, hidden_idx, token_idx);
hidden_states.index(state_idx, hidden_idx, token_idx) = (value - mean_value) * scale_value;
}
}
}
}
hidden_states.reshape_({kNumStates * kHiddenSize, valid_tokens});
return projector->compute(n_threads, hidden_states);
}
SDCondition get_learned_condition(int n_threads,
const ConditionerParams& conditioner_params) override {
int64_t t0 = ggml_time_ms();
std::string prompt;
std::pair<int, int> prompt_attn_range;
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
auto hidden_states = encode_prompt(n_threads, prompt, prompt_attn_range);
GGML_ASSERT(!hidden_states.empty());
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing LTXAV condition graph completed, taking %" PRId64 " ms", t1 - t0);
SDCondition result;
result.c_crossattn = std::move(hidden_states);
return result;
}
};
#endif

View File

@ -5,6 +5,7 @@
#include "anima.hpp"
#include "ernie_image.hpp"
#include "flux.hpp"
#include "ltxv.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
#include "tensor_ggml.hpp"
@ -14,7 +15,9 @@
struct DiffusionParams {
const sd::Tensor<float>* x = nullptr;
const sd::Tensor<float>* audio_x = nullptr;
const sd::Tensor<float>* timesteps = nullptr;
const sd::Tensor<float>* audio_timesteps = nullptr;
const sd::Tensor<float>* context = nullptr;
const sd::Tensor<float>* c_concat = nullptr;
const sd::Tensor<float>* y = nullptr;
@ -28,6 +31,7 @@ struct DiffusionParams {
float control_strength = 0.f;
const sd::Tensor<float>* vace_context = nullptr;
float vace_strength = 1.f;
int audio_length = 0;
const std::vector<int>* skip_layers = nullptr;
};
@ -579,4 +583,69 @@ struct ErnieImageModel : public DiffusionModel {
}
};
struct LTXAVModel : public DiffusionModel {
std::string prefix;
LTXV::LTXAVRunner ltxav;
LTXAVModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), ltxav(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
}
std::string get_desc() override {
return ltxav.get_desc();
}
void alloc_params_buffer() override {
ltxav.alloc_params_buffer();
}
void free_params_buffer() override {
ltxav.free_params_buffer();
}
void free_compute_buffer() override {
ltxav.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
ltxav.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return ltxav.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
ltxav.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 0;
}
void set_flash_attention_enabled(bool enabled) override {
ltxav.set_flash_attention_enabled(enabled);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
ltxav.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return ltxav.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.audio_x),
tensor_or_empty(diffusion_params.audio_timesteps),
diffusion_params.audio_length);
}
};
#endif

View File

@ -2,8 +2,10 @@
#define __LLM_HPP__
#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <optional>
@ -30,6 +32,7 @@ namespace LLM {
QWEN3,
MISTRAL_SMALL_3_2,
MINISTRAL_3_3B,
GEMMA3_12B,
ARCH_COUNT,
};
@ -38,6 +41,12 @@ namespace LLM {
"qwen3",
"mistral_small3.2",
"ministral3.3b",
"gemma3_12b",
};
enum class MLPActivation {
SILU,
GELU_TANH,
};
struct LLMVisionParams {
@ -64,14 +73,62 @@ namespace LLM {
int head_dim = 128;
bool qkv_bias = true;
bool qk_norm = false;
bool rms_norm_add = false;
bool normalize_input = false;
int64_t vocab_size = 152064;
int64_t max_position_embeddings = 128000;
float rms_norm_eps = 1e-06f;
MLPActivation mlp_activation = MLPActivation::SILU;
std::vector<float> rope_thetas = {1000000.f};
std::vector<float> rope_scales = {1.f};
std::vector<int> sliding_attention;
LLMVisionParams vision;
};
struct MLP : public GGMLBlock {
struct LLMRMSNorm : public UnaryBlock {
protected:
int64_t hidden_size;
float eps;
bool add_unit_offset;
std::string prefix;
void init_params(ggml_context* ctx,
const String2TensorStorage& tensor_storage_map = {},
std::string prefix = "") override {
this->prefix = prefix;
params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
}
public:
MLP(int64_t hidden_size, int64_t intermediate_size, bool bias = false) {
LLMRMSNorm(int64_t hidden_size,
float eps = 1e-06f,
bool add_unit_offset = false)
: hidden_size(hidden_size), eps(eps), add_unit_offset(add_unit_offset) {}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
ggml_tensor* w = params["weight"];
if (ctx->weight_adapter) {
w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
}
x = ggml_rms_norm(ctx->ggml_ctx, x, eps);
auto scaled = ggml_mul(ctx->ggml_ctx, x, w);
if (add_unit_offset) {
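// Gemma-style unit offset: the effective weight is (1 + w), computed as
// x * w + x without materializing the offset weight.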
scaled = ggml_add_inplace(ctx->ggml_ctx, scaled, x);
}
return scaled;
}
};
struct MLP : public GGMLBlock {
protected:
MLPActivation activation;
public:
MLP(int64_t hidden_size,
int64_t intermediate_size,
bool bias = false,
MLPActivation activation_ = MLPActivation::SILU)
: activation(activation_) {
blocks["gate_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, bias));
blocks["up_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, bias));
blocks["down_proj"] = std::shared_ptr<GGMLBlock>(new Linear(intermediate_size, hidden_size, bias));
@ -84,7 +141,11 @@ namespace LLM {
auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);
auto h = gate_proj->forward(ctx, x);
if (activation == MLPActivation::GELU_TANH) {
h = ggml_ext_gelu(ctx->ggml_ctx, h, true);
} else {
h = ggml_silu_inplace(ctx->ggml_ctx, h);
}
h = ggml_mul_inplace(ctx->ggml_ctx, h, up_proj->forward(ctx, x));
h = down_proj->forward(ctx, h);
return h;
@ -377,24 +438,35 @@ namespace LLM {
int64_t num_heads;
int64_t num_kv_heads;
bool qk_norm;
int64_t max_position_embeddings;
std::vector<float> rope_thetas;
std::vector<float> rope_scales;
public:
Attention(const LLMParams& params)
: arch(params.arch), num_heads(params.num_heads), num_kv_heads(params.num_kv_heads), head_dim(params.head_dim), qk_norm(params.qk_norm) {
: arch(params.arch),
num_heads(params.num_heads),
num_kv_heads(params.num_kv_heads),
head_dim(params.head_dim),
qk_norm(params.qk_norm),
max_position_embeddings(params.max_position_embeddings),
rope_thetas(params.rope_thetas),
rope_scales(params.rope_scales) {
blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, false);
if (params.qk_norm) {
blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim, params.rms_norm_eps);
blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim, params.rms_norm_eps);
blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* input_pos,
ggml_tensor* attention_mask = nullptr) {
ggml_tensor* attention_mask = nullptr,
int rope_index = 0) {
// x: [N, n_token, hidden_size]
int64_t n_token = x->ne[1];
int64_t N = x->ne[2];
@ -412,8 +484,8 @@ namespace LLM {
v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_kv_heads, n_token, N); // [N, n_token, num_kv_heads, head_dim]
if (qk_norm) {
auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
auto k_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
auto q_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["q_norm"]);
auto k_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["k_norm"]);
q = q_norm->forward(ctx, q);
k = k_norm->forward(ctx, k);
@ -428,6 +500,36 @@ namespace LLM {
} else if (arch == LLMArch::QWEN3) {
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
} else if (arch == LLMArch::GEMMA3_12B) {
float rope_theta = (rope_index == 1 ? 10000.0f : 1000000.0f);
float rope_scale = (rope_index == 1 ? 1.f : 8.f);
float freq_scale = 1.f / rope_scale;
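// Gemma3 uses two RoPE configurations: sliding-window layers (rope_index == 1)
// keep the local theta of 10000 with no position scaling, while global layers
// use theta 1e6 with 8x linear scaling (freq_scale = 1/8).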
q = ggml_rope_ext(ctx->ggml_ctx,
q,
input_pos,
nullptr,
head_dim,
GGML_ROPE_TYPE_NORMAL,
0,
rope_theta,
freq_scale,
0.f,
1.f,
32.f,
1.f);
k = ggml_rope_ext(ctx->ggml_ctx,
k,
input_pos,
nullptr,
head_dim,
GGML_ROPE_TYPE_NORMAL,
0,
rope_theta,
freq_scale,
0.f,
1.f,
32.f,
1.f);
} else {
int sections[4] = {16, 24, 24, 0};
q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
@ -448,32 +550,75 @@ namespace LLM {
};
struct TransformerBlock : public GGMLBlock {
protected:
LLMArch arch;
int sliding_attention;
bool has_post_attention_norm;
bool has_post_ffw_norm;
public:
TransformerBlock(const LLMParams& params) {
TransformerBlock(const LLMParams& params, int layer_index)
: arch(params.arch),
sliding_attention(0),
has_post_attention_norm(params.arch == LLMArch::GEMMA3_12B),
has_post_ffw_norm(params.arch == LLMArch::GEMMA3_12B) {
blocks["self_attn"] = std::make_shared<Attention>(params);
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size, params.intermediate_size);
blocks["input_layernorm"] = std::make_shared<RMSNorm>(params.hidden_size, params.rms_norm_eps);
blocks["post_attention_layernorm"] = std::make_shared<RMSNorm>(params.hidden_size, params.rms_norm_eps);
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size,
params.intermediate_size,
false,
params.mlp_activation);
blocks["input_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
blocks["post_attention_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
if (has_post_attention_norm) {
blocks["post_attention_norm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
}
if (has_post_ffw_norm) {
blocks["post_ffw_norm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
}
if (!params.sliding_attention.empty()) {
sliding_attention = params.sliding_attention[layer_index % params.sliding_attention.size()];
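// The pattern repeats across layers: with Gemma3's {1024, 1024, 1024, 1024,
// 1024, 0}, every sixth layer uses global attention and the rest use a
// 1024-token sliding window.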
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* input_pos,
ggml_tensor* attention_mask = nullptr) {
ggml_tensor* attention_mask = nullptr,
ggml_tensor* sliding_attention_mask = nullptr) {
// x: [N, n_token, hidden_size]
auto self_attn = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
auto input_layernorm = std::dynamic_pointer_cast<RMSNorm>(blocks["input_layernorm"]);
auto post_attention_layernorm = std::dynamic_pointer_cast<RMSNorm>(blocks["post_attention_layernorm"]);
auto input_layernorm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["input_layernorm"]);
auto post_attention_layernorm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_attention_layernorm"]);
std::shared_ptr<LLMRMSNorm> post_attention_norm = nullptr;
std::shared_ptr<LLMRMSNorm> post_ffw_norm = nullptr;
if (has_post_attention_norm) {
post_attention_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_attention_norm"]);
}
if (has_post_ffw_norm) {
post_ffw_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_ffw_norm"]);
}
ggml_tensor* block_attention_mask = attention_mask;
int rope_index = 0;
if (arch == LLMArch::GEMMA3_12B && sliding_attention > 0) {
block_attention_mask = sliding_attention_mask;
rope_index = 1;
}
auto residual = x;
x = input_layernorm->forward(ctx, x);
x = self_attn->forward(ctx, x, input_pos, attention_mask);
x = self_attn->forward(ctx, x, input_pos, block_attention_mask, rope_index);
if (post_attention_norm != nullptr) {
x = post_attention_norm->forward(ctx, x);
}
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
residual = x;
x = post_attention_layernorm->forward(ctx, x);
x = mlp->forward(ctx, x);
if (post_ffw_norm != nullptr) {
x = post_ffw_norm->forward(ctx, x);
}
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
return x;
@ -483,28 +628,36 @@ namespace LLM {
struct TextModel : public GGMLBlock {
protected:
int64_t num_layers;
int64_t hidden_size;
bool normalize_input;
float input_scale;
public:
TextModel(const LLMParams& params)
: num_layers(params.num_layers) {
: num_layers(params.num_layers),
hidden_size(params.hidden_size),
normalize_input(params.normalize_input),
input_scale(std::sqrt(static_cast<float>(params.hidden_size))) {
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size, params.hidden_size));
for (int i = 0; i < num_layers; i++) {
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params));
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params, i));
}
blocks["norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.hidden_size, params.rms_norm_eps));
blocks["norm"] = std::shared_ptr<GGMLBlock>(new LLMRMSNorm(params.hidden_size, params.rms_norm_eps, params.rms_norm_add));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* input_ids,
ggml_tensor* input_pos,
ggml_tensor* attention_mask,
ggml_tensor* sliding_attention_mask,
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers) {
std::set<int> out_layers,
bool return_all_hidden_states = false) {
// input_ids: [N, n_token]
// return: [N, n_token, hidden_size]
auto embed_tokens = std::dynamic_pointer_cast<Embedding>(blocks["embed_tokens"]);
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
auto norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["norm"]);
auto x = embed_tokens->forward(ctx, input_ids);
@ -549,22 +702,44 @@ namespace LLM {
x = input_embed;
}
if (normalize_input) {
x = ggml_ext_scale(ctx->ggml_ctx, x, input_scale, true);
}
if (return_all_hidden_states) {
intermediate_outputs.push_back(x);
}
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);
x = block->forward(ctx, x, input_pos, attention_mask);
if (out_layers.find(i + 1) != out_layers.end()) {
x = block->forward(ctx, x, input_pos, attention_mask, sliding_attention_mask);
if (return_all_hidden_states) {
if (i + 1 < num_layers) {
intermediate_outputs.push_back(x);
}
} else if (out_layers.find(i + 1) != out_layers.end()) {
intermediate_outputs.push_back(x);
}
}
if (!intermediate_outputs.empty()) {
auto normed_x = norm->forward(ctx, x);
if (return_all_hidden_states) {
intermediate_outputs.push_back(normed_x);
x = intermediate_outputs[0];
for (int i = 1; i < intermediate_outputs.size(); i++) {
x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0);
}
} else if (!intermediate_outputs.empty()) {
if (out_layers.find(static_cast<int>(num_layers + 1)) != out_layers.end()) {
intermediate_outputs.push_back(normed_x);
}
x = intermediate_outputs[0];
for (int i = 1; i < intermediate_outputs.size(); i++) {
x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0);
}
} else {
x = norm->forward(ctx, x);
x = normed_x;
}
return x;
}
@ -599,12 +774,21 @@ namespace LLM {
ggml_tensor* input_ids,
ggml_tensor* input_pos,
ggml_tensor* attention_mask,
ggml_tensor* sliding_attention_mask,
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers) {
std::set<int> out_layers,
bool return_all_hidden_states = false) {
// input_ids: [N, n_token]
auto model = std::dynamic_pointer_cast<TextModel>(blocks["model"]);
auto x = model->forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
auto x = model->forward(ctx,
input_ids,
input_pos,
attention_mask,
sliding_attention_mask,
image_embeds,
out_layers,
return_all_hidden_states);
return x;
}
@ -627,6 +811,7 @@ namespace LLM {
std::vector<int> input_pos_vec;
std::vector<float> attention_mask_vec;
std::vector<float> sliding_attention_mask_vec;
std::vector<float> window_mask_vec;
std::vector<int> window_index_vec;
std::vector<int> window_inverse_index_vec;
@ -653,6 +838,23 @@ namespace LLM {
params.qkv_bias = false;
params.qk_norm = true;
params.rms_norm_eps = 1e-6f;
} else if (arch == LLMArch::GEMMA3_12B) {
params.head_dim = 256;
params.num_heads = 16;
params.num_kv_heads = 8;
params.qkv_bias = false;
params.qk_norm = true;
params.rms_norm_eps = 1e-6f;
// llama.cpp pre-adds the Gemma3 unit offset (+1) to norm.weight when
// exporting GGUF, so rms_norm_add must stay disabled here or the offset
// would be applied twice.
params.rms_norm_add = false;
params.normalize_input = true;
params.max_position_embeddings = 131072;
params.mlp_activation = MLPActivation::GELU_TANH;
params.rope_thetas = {1000000.f, 10000.f};
params.rope_scales = {8.f, 1.f};
params.sliding_attention = {1024, 1024, 1024, 1024, 1024, 0};
}
bool have_vision_weight = false;
bool llama_cpp_style = false;
@ -722,9 +924,18 @@ namespace LLM {
ggml_tensor* input_ids,
ggml_tensor* input_pos,
ggml_tensor* attention_mask,
ggml_tensor* sliding_attention_mask,
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers) {
auto hidden_states = model.forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); // [N, n_token, hidden_size]
std::set<int> out_layers,
bool return_all_hidden_states = false) {
auto hidden_states = model.forward(ctx,
input_ids,
input_pos,
attention_mask,
sliding_attention_mask,
image_embeds,
out_layers,
return_all_hidden_states); // [N, n_token, hidden_size]
return hidden_states;
}
@ -741,8 +952,9 @@ namespace LLM {
ggml_cgraph* build_graph(const sd::Tensor<int32_t>& input_ids_tensor,
const sd::Tensor<float>& attention_mask_tensor,
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds_tensor,
std::set<int> out_layers) {
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
std::set<int> out_layers,
bool return_all_hidden_states = false) {
ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE);
ggml_tensor* input_ids = make_input(input_ids_tensor);
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
image_embeds.reserve(image_embeds_tensor.size());
@ -752,7 +964,10 @@ namespace LLM {
}
int64_t n_tokens = input_ids->ne[0];
if (params.arch == LLMArch::MISTRAL_SMALL_3_2 || params.arch == LLMArch::MINISTRAL_3_3B || params.arch == LLMArch::QWEN3) {
if (params.arch == LLMArch::MISTRAL_SMALL_3_2 ||
params.arch == LLMArch::MINISTRAL_3_3B ||
params.arch == LLMArch::QWEN3 ||
params.arch == LLMArch::GEMMA3_12B) {
input_pos_vec.resize(n_tokens);
for (int i = 0; i < n_tokens; ++i) {
input_pos_vec[i] = i;
@ -773,6 +988,7 @@ namespace LLM {
set_backend_tensor_data(input_pos, input_pos_vec.data());
ggml_tensor* attention_mask = nullptr;
ggml_tensor* sliding_attention_mask = nullptr;
if (!attention_mask_tensor.empty()) {
attention_mask = make_input(attention_mask_tensor);
} else {
@ -790,9 +1006,36 @@ namespace LLM {
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
}
if (params.arch == LLMArch::GEMMA3_12B) {
sliding_attention_mask_vec.resize(n_tokens * n_tokens);
if (!attention_mask_tensor.empty()) {
GGML_ASSERT(attention_mask_tensor.numel() == n_tokens * n_tokens);
sliding_attention_mask_vec = attention_mask_tensor.values();
} else {
sliding_attention_mask_vec = attention_mask_vec;
}
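// Start from the dense causal/pad mask, then additionally hide keys more than
// 1023 positions behind the query, i.e. outside the 1024-token sliding window.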
for (int i0 = 0; i0 < n_tokens; i0++) {
for (int i1 = 0; i1 < n_tokens; i1++) {
if (i0 + 1024 <= i1) {
sliding_attention_mask_vec[i1 * n_tokens + i0] = -INFINITY;
}
}
}
sliding_attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
set_backend_tensor_data(sliding_attention_mask, sliding_attention_mask_vec.data());
}
auto runner_ctx = get_context();
ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
ggml_tensor* hidden_states = forward(&runner_ctx,
input_ids,
input_pos,
attention_mask,
sliding_attention_mask,
image_embeds,
out_layers,
return_all_hidden_states);
ggml_build_forward_expand(gf, hidden_states);
@ -803,9 +1046,14 @@ namespace LLM {
const sd::Tensor<int32_t>& input_ids,
const sd::Tensor<float>& attention_mask,
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
std::set<int> out_layers) {
std::set<int> out_layers,
bool return_all_hidden_states = false) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(input_ids, attention_mask, image_embeds, out_layers);
return build_graph(input_ids,
attention_mask,
image_embeds,
out_layers,
return_all_hidden_states);
};
return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
}

src/ltx_vae.h (new file, 970 additions)
View File

@ -0,0 +1,970 @@
#ifndef __SD_LTX_VAE_H__
#define __SD_LTX_VAE_H__
#include <fstream>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "ltxv.hpp"
#include "vae.hpp"
#include "wan.hpp"
namespace LTXVAE {
static inline ggml_tensor* apply_scale_shift(ggml_context* ctx,
ggml_tensor* x,
ggml_tensor* scale,
ggml_tensor* shift) {
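// Equivalent to x * (1 + scale) + shift.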
x = ggml_add(ctx, x, ggml_mul(ctx, x, scale));
x = ggml_add(ctx, x, shift);
return x;
}
static inline ggml_tensor* reshape_channel_broadcast(ggml_context* ctx,
ggml_tensor* x) {
return ggml_reshape_4d(ctx, x, 1, 1, 1, ggml_nelements(x));
}
static inline std::pair<ggml_tensor*, ggml_tensor*> get_shift_scale(ggml_context* ctx,
ggml_tensor* table,
ggml_tensor* timestep,
int64_t channels,
int parts) {
GGML_ASSERT(timestep != nullptr);
GGML_ASSERT(ggml_nelements(timestep) == channels * parts);
auto timestep_view = ggml_reshape_2d(ctx, timestep, channels, parts);
auto values = ggml_add(ctx, table, timestep_view);
auto chunks = ggml_ext_chunk(ctx, values, parts, 1, false);
auto shift = reshape_channel_broadcast(ctx, ggml_cont(ctx, chunks[0]));
auto scale = reshape_channel_broadcast(ctx, ggml_cont(ctx, chunks[1]));
return {shift, scale};
}
static inline ggml_tensor* depth_to_space_3d(ggml_context* ctx,
ggml_tensor* x,
int64_t c,
int factor_t,
int factor_s,
bool drop_first_temporal_frame) {
// x: [B*c*p1*p2*p3, T, H, W], B == 1, p2 == p3 == factor_s, p1 == factor_t
// return: [B*c, T*p1, H*p2, W*p3]
// Match: rearrange(x, "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)")
const int64_t T = x->ne[2];
const int64_t H = x->ne[1];
const int64_t W = x->ne[0];
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 1, 3, 2)); // [T, C, H, W]
x = ggml_reshape_4d(ctx, x, W, H, factor_s, factor_s * factor_t * c * T); // [T*c*p1*p2, p3, H, W]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3)); // [T*c*p1*p2, H, W, p3]
x = ggml_reshape_4d(ctx, x, factor_s * W, H, factor_s, factor_t * c * T); // [T*c*p1, p2, H, W*p3]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [T*c*p1, H, p2, W*p3]
x = ggml_reshape_4d(ctx, x, factor_s * W * factor_s * H, factor_t, c, T); // [T, c, p1, H*p2*W*p3]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 1, 3, 2)); // [c, T, p1, H*p2*W*p3]
x = ggml_reshape_4d(ctx, x, factor_s * W, factor_s * H, factor_t * T, c); // [c, T*p1, H*p2, W*p3]
if (drop_first_temporal_frame && factor_t > 1 && x->ne[2] > 0) {
x = ggml_ext_slice(ctx, x, 2, 1, x->ne[2]);
}
return x;
}
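// depth_to_space_3d example: with c = 3, factor_t = 2, factor_s = 2, a
// [B, 24, T, H, W] input becomes [B, 3, 2T, 2H, 2W] (and loses its first
// temporal frame when drop_first_temporal_frame is set).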
static inline ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int patch_size) {
return WAN::WanVAE::patchify(ctx, x, patch_size, 1);
}
class CausalConv3d : public GGMLBlock {
protected:
int time_kernel_size;
public:
CausalConv3d(int64_t in_channels,
int64_t out_channels,
int kernel_size = 3,
std::tuple<int, int, int> stride = {1, 1, 1},
int dilation = 1,
bool bias = true) {
time_kernel_size = kernel_size;
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
out_channels,
{kernel_size, kernel_size, kernel_size},
stride,
{0, kernel_size / 2, kernel_size / 2},
{dilation, 1, 1},
bias));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
bool causal = true) {
// x: [B*C, T, H, W], B == 1
auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);
if (causal) {
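// Causal temporal padding: replicate the first frame (time_kernel_size - 1)
// times in front so the convolution never sees future frames; the non-causal
// branch below pads both ends symmetrically instead.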
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
auto first_frame_pad = first_frame;
for (int i = 1; i < time_kernel_size - 1; i++) {
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
}
x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
} else {
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
auto first_frame_pad = first_frame;
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
}
auto last_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
auto last_frame_pad = last_frame;
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
last_frame_pad = ggml_concat(ctx->ggml_ctx, last_frame_pad, last_frame, 2);
}
x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
x = ggml_concat(ctx->ggml_ctx, x, last_frame_pad, 2);
}
return conv->forward(ctx, x);
}
};
struct PixelNorm3D : public UnaryBlock {
float eps;
PixelNorm3D(float eps = 1e-8f)
: eps(eps) {}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
auto h = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 3, 0, 1, 2));
h = ggml_rms_norm(ctx->ggml_ctx, h, eps);
h = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, h, 1, 2, 3, 0));
return h;
}
};
struct PixArtAlphaCombinedTimestepSizeEmbeddings : public GGMLBlock {
int64_t embedding_dim;
PixArtAlphaCombinedTimestepSizeEmbeddings(int64_t embedding_dim)
: embedding_dim(embedding_dim) {
blocks["timestep_embedder"] = std::make_shared<LTXV::TimestepEmbedder>(embedding_dim);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* timestep) {
auto timestep_embedder = std::dynamic_pointer_cast<LTXV::TimestepEmbedder>(blocks["timestep_embedder"]);
return timestep_embedder->forward(ctx, timestep);
}
};
struct ResnetBlock3D : public GGMLBlock {
int64_t channels;
bool timestep_conditioning;
protected:
void init_params(ggml_context* ctx,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "") override {
if (timestep_conditioning) {
params["scale_shift_table"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, channels, 4);
}
}
public:
ResnetBlock3D(int64_t channels,
float eps = 1e-6f,
bool timestep_conditioning = false)
: channels(channels), timestep_conditioning(timestep_conditioning) {
blocks["norm1"] = std::make_shared<PixelNorm3D>(eps);
blocks["conv1"] = std::make_shared<CausalConv3d>(channels, channels, 3);
blocks["norm2"] = std::make_shared<PixelNorm3D>(eps);
blocks["conv2"] = std::make_shared<CausalConv3d>(channels, channels, 3);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* timestep = nullptr,
bool causal = false) {
auto norm1 = std::dynamic_pointer_cast<PixelNorm3D>(blocks["norm1"]);
auto conv1 = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv1"]);
auto norm2 = std::dynamic_pointer_cast<PixelNorm3D>(blocks["norm2"]);
auto conv2 = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv2"]);
ggml_tensor* shift1 = nullptr;
ggml_tensor* scale1 = nullptr;
ggml_tensor* shift2 = nullptr;
ggml_tensor* scale2 = nullptr;
if (timestep_conditioning) {
GGML_ASSERT(timestep != nullptr);
auto values = ggml_add(ctx->ggml_ctx,
params["scale_shift_table"],
ggml_reshape_2d(ctx->ggml_ctx, timestep, channels, 4));
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, values, 4, 1, false);
shift1 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[0]));
scale1 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[1]));
shift2 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[2]));
scale2 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[3]));
}
auto h = norm1->forward(ctx, x);
if (timestep_conditioning) {
h = apply_scale_shift(ctx->ggml_ctx, h, scale1, shift1);
}
h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = conv1->forward(ctx, h, causal);
h = norm2->forward(ctx, h);
if (timestep_conditioning) {
h = apply_scale_shift(ctx->ggml_ctx, h, scale2, shift2);
}
h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = conv2->forward(ctx, h, causal);
return ggml_add(ctx->ggml_ctx, h, x);
}
};
struct UNetMidBlock3D : public GGMLBlock {
int64_t channels;
int num_layers;
bool timestep_conditioning;
UNetMidBlock3D(int64_t channels,
int num_layers,
bool timestep_conditioning)
: channels(channels),
num_layers(num_layers),
timestep_conditioning(timestep_conditioning) {
if (timestep_conditioning) {
blocks["time_embedder"] = std::make_shared<PixArtAlphaCombinedTimestepSizeEmbeddings>(channels * 4);
}
for (int i = 0; i < num_layers; i++) {
blocks["res_blocks." + std::to_string(i)] = std::make_shared<ResnetBlock3D>(channels, 1e-6f, timestep_conditioning);
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* timestep = nullptr,
bool causal = false) {
ggml_tensor* timestep_embed = nullptr;
if (timestep_conditioning) {
GGML_ASSERT(timestep != nullptr);
auto time_embedder = std::dynamic_pointer_cast<PixArtAlphaCombinedTimestepSizeEmbeddings>(blocks["time_embedder"]);
timestep_embed = time_embedder->forward(ctx, timestep);
}
for (int i = 0; i < num_layers; i++) {
auto resnet = std::dynamic_pointer_cast<ResnetBlock3D>(blocks["res_blocks." + std::to_string(i)]);
x = resnet->forward(ctx, x, timestep_embed, causal);
}
return x;
}
};
struct DepthToSpaceUpsample : public GGMLBlock {
int64_t in_channels;
int factor_t;
int factor_s;
int out_channels_reduction_factor;
bool residual;
DepthToSpaceUpsample(int64_t in_channels,
int factor_t = 2,
int factor_s = 2,
int out_channels_reduction_factor = 2,
bool residual = true)
: in_channels(in_channels),
factor_t(factor_t),
factor_s(factor_s),
out_channels_reduction_factor(out_channels_reduction_factor),
residual(residual) {
const int64_t factor = static_cast<int64_t>(factor_t) * static_cast<int64_t>(factor_s) * static_cast<int64_t>(factor_s);
const int64_t out_dim = (factor * in_channels) / out_channels_reduction_factor;
blocks["conv"] = std::make_shared<CausalConv3d>(in_channels, out_dim, 3);
}
int64_t get_output_channels() const {
return in_channels / out_channels_reduction_factor;
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
bool causal = false) {
auto conv = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv"]);
ggml_tensor* x_in = nullptr;
if (residual) {
x_in = depth_to_space_3d(ctx->ggml_ctx, x, in_channels / (factor_t * factor_s * factor_s), factor_t, factor_s, factor_t > 1);
int repeat = (factor_t * factor_s * factor_s) / out_channels_reduction_factor;
auto res = x_in;
for (int i = 1; i < repeat; i++) {
res = ggml_concat(ctx->ggml_ctx, res, x_in, 3);
}
x_in = res;
}
x = conv->forward(ctx, x, causal);
x = depth_to_space_3d(ctx->ggml_ctx, x, get_output_channels(), factor_t, factor_s, factor_t > 1);
if (residual) {
x = ggml_add(ctx->ggml_ctx, x, x_in);
}
return x;
}
};
struct SpaceToDepthDownsample : public GGMLBlock {
int64_t in_channels;
int64_t out_channels;
int factor_t;
int factor_s;
SpaceToDepthDownsample(int64_t in_channels,
int64_t out_channels,
int factor_t,
int factor_s)
: in_channels(in_channels),
out_channels(out_channels),
factor_t(factor_t),
factor_s(factor_s) {
const int64_t factor = static_cast<int64_t>(factor_t) * static_cast<int64_t>(factor_s) * static_cast<int64_t>(factor_s);
GGML_ASSERT(out_channels % factor == 0);
blocks["conv"] = std::make_shared<CausalConv3d>(in_channels, out_channels / factor, 3);
blocks["skip_downsample"] = std::make_shared<WAN::AvgDown3D>(in_channels, out_channels, factor_t, factor_s);
blocks["conv_downsample"] = std::make_shared<WAN::AvgDown3D>(out_channels / factor, out_channels, factor_t, factor_s);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
bool causal = true) {
auto conv = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv"]);
auto skip_downsample = std::dynamic_pointer_cast<WAN::AvgDown3D>(blocks["skip_downsample"]);
auto conv_downsample = std::dynamic_pointer_cast<WAN::AvgDown3D>(blocks["conv_downsample"]);
if (factor_t > 1 && x->ne[2] > 0) {
auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
auto first_frame_pad = first_frame;
for (int i = 1; i < factor_t; ++i) {
first_frame_pad = ggml_concat(ctx->ggml_ctx, first_frame_pad, first_frame, 2);
}
x = ggml_concat(ctx->ggml_ctx, first_frame_pad, x, 2);
}
auto residual = skip_downsample->forward(ctx, x);
auto h = conv->forward(ctx, x, causal);
h = conv_downsample->forward(ctx, h);
return ggml_add(ctx->ggml_ctx, h, residual);
}
};
struct PerChannelStatistics : public GGMLBlock {
protected:
void init_params(ggml_context* ctx,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "") override {
params["std-of-means"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128);
params["mean-of-means"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128);
}
public:
ggml_tensor* un_normalize(GGMLRunnerContext* ctx,
ggml_tensor* x) {
auto std_tensor = reshape_channel_broadcast(ctx->ggml_ctx, params["std-of-means"]);
auto mean_tensor = reshape_channel_broadcast(ctx->ggml_ctx, params["mean-of-means"]);
return ggml_add(ctx->ggml_ctx, ggml_mul(ctx->ggml_ctx, x, std_tensor), mean_tensor);
}
ggml_tensor* normalize(GGMLRunnerContext* ctx,
ggml_tensor* x) {
auto std_tensor = reshape_channel_broadcast(ctx->ggml_ctx, params["std-of-means"]);
auto mean_tensor = reshape_channel_broadcast(ctx->ggml_ctx, params["mean-of-means"]);
return ggml_div(ctx->ggml_ctx, ggml_sub(ctx->ggml_ctx, x, mean_tensor), std_tensor);
}
};
struct DecoderConfig {
struct Block {
std::string type;
int num_layers = 0;
int multiplier = 1;
};
std::vector<Block> blocks;
};
struct EncoderConfig {
struct Block {
std::string type;
int num_layers = 0;
int multiplier = 1;
};
std::vector<Block> blocks;
};
static inline bool has_tensor(const String2TensorStorage& tensor_storage_map,
const std::string& name) {
return tensor_storage_map.find(name) != tensor_storage_map.end();
}
static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map,
const std::string& name,
int64_t fallback = 0) {
auto iter = tensor_storage_map.find(name);
if (iter == tensor_storage_map.end()) {
return fallback;
}
return iter->second.ne[0];
}
static inline DecoderConfig infer_decoder_config_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
int64_t conv_in_channels) {
DecoderConfig cfg;
const std::string decoder_prefix = prefix + ".decoder.up_blocks.";
int64_t current_channels = conv_in_channels;
for (int block_idx = 0;; ++block_idx) {
const std::string block_prefix = decoder_prefix + std::to_string(block_idx);
const std::string res0_bias = block_prefix + ".res_blocks.0.conv1.conv.bias";
const std::string conv_bias = block_prefix + ".conv.conv.bias";
if (has_tensor(tensor_storage_map, res0_bias)) {
int num_layers = 0;
while (has_tensor(tensor_storage_map,
block_prefix + ".res_blocks." + std::to_string(num_layers) + ".conv1.conv.bias")) {
num_layers++;
}
cfg.blocks.push_back({"res_x", num_layers, 1});
current_channels = get_tensor_ne0(tensor_storage_map, res0_bias, current_channels);
continue;
}
if (!has_tensor(tensor_storage_map, conv_bias)) {
break;
}
int64_t next_channels = 0;
for (int next_idx = block_idx + 1;; ++next_idx) {
const std::string next_res0_bias = decoder_prefix + std::to_string(next_idx) + ".res_blocks.0.conv1.conv.bias";
const std::string next_conv_bias = decoder_prefix + std::to_string(next_idx) + ".conv.conv.bias";
if (has_tensor(tensor_storage_map, next_res0_bias)) {
next_channels = get_tensor_ne0(tensor_storage_map, next_res0_bias);
break;
}
if (!has_tensor(tensor_storage_map, next_conv_bias)) {
break;
}
}
if (next_channels <= 0 || current_channels % next_channels != 0) {
next_channels = std::max<int64_t>(1, current_channels / 2);
}
const int64_t conv_out_dim = get_tensor_ne0(tensor_storage_map, conv_bias);
const int64_t reduction = std::max<int64_t>(1, current_channels / next_channels);
const int64_t factor = next_channels > 0 ? conv_out_dim / next_channels : 0;
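// conv_out_dim == next_channels * factor_t * factor_s * factor_s, so a factor
// of 8 upsamples time and space (2x2x2), 4 upsamples space only (1x2x2), and
// 2 upsamples time only (2x1x1).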
if (factor == 8) {
cfg.blocks.push_back({"compress_all", 0, static_cast<int>(reduction)});
} else if (factor == 4) {
cfg.blocks.push_back({"compress_space", 0, static_cast<int>(reduction)});
} else if (factor == 2) {
cfg.blocks.push_back({"compress_time", 0, static_cast<int>(reduction)});
} else {
LOG_WARN("unexpected LTX VAE upsample factor at '%s': conv_out=%lld current=%lld next=%lld, falling back to compress_all x%d",
block_prefix.c_str(),
(long long)conv_out_dim,
(long long)current_channels,
(long long)next_channels,
(int)reduction);
cfg.blocks.push_back({"compress_all", 0, static_cast<int>(reduction)});
}
current_channels = next_channels;
}
return cfg;
}
static inline int detect_ltx_vae_version(const String2TensorStorage& tensor_storage_map,
const std::string& prefix) {
const std::string v2_probe = prefix + ".encoder.down_blocks.1.conv.conv.bias";
if (tensor_storage_map.find(v2_probe) != tensor_storage_map.end()) {
return 2;
}
return 1;
}
static inline bool detect_ltx_vae_timestep_conditioning(const String2TensorStorage& tensor_storage_map,
const std::string& prefix) {
return tensor_storage_map.find(prefix + ".decoder.timestep_scale_multiplier") != tensor_storage_map.end();
}
static inline EncoderConfig get_encoder_config(int version) {
EncoderConfig cfg;
if (version < 2) {
GGML_ABORT("LTX VAE encoder is only implemented for version >= 2");
}
cfg.blocks = {
{"res_x", 4, 1},
{"compress_space_res", 0, 2},
{"res_x", 6, 1},
{"compress_time_res", 0, 2},
{"res_x", 6, 1},
{"compress_all_res", 0, 2},
{"res_x", 2, 1},
{"compress_all_res", 0, 2},
{"res_x", 2, 1},
};
return cfg;
}
struct Encoder : public GGMLBlock {
int version;
int patch_size;
int64_t in_channels;
int64_t latent_channels;
Encoder(int version,
int patch_size = 4,
int64_t in_channels = 3,
int64_t latent_channels = 128)
: version(version),
patch_size(patch_size),
in_channels(in_channels),
latent_channels(latent_channels) {
auto cfg = get_encoder_config(version);
int64_t channels = 128;
int64_t in_dim = in_channels * patch_size * patch_size;
blocks["conv_in"] = std::make_shared<CausalConv3d>(in_dim, channels, 3);
for (int block_idx = 0; block_idx < static_cast<int>(cfg.blocks.size()); ++block_idx) {
const auto& block = cfg.blocks[block_idx];
if (block.type == "res_x") {
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<UNetMidBlock3D>(channels,
block.num_layers,
false);
} else if (block.type == "compress_space_res") {
int64_t next_channels = channels * block.multiplier;
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
next_channels,
1,
2);
channels = next_channels;
} else if (block.type == "compress_time_res") {
int64_t next_channels = channels * block.multiplier;
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
next_channels,
2,
1);
channels = next_channels;
} else if (block.type == "compress_all_res") {
int64_t next_channels = channels * block.multiplier;
blocks["down_blocks." + std::to_string(block_idx)] = std::make_shared<SpaceToDepthDownsample>(channels,
next_channels,
2,
2);
channels = next_channels;
} else {
GGML_ABORT("Unsupported LTX VAE encoder block");
}
}
blocks["conv_norm_out"] = std::make_shared<PixelNorm3D>();
blocks["conv_out"] = std::make_shared<CausalConv3d>(channels, latent_channels + 1, 3);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x) {
auto conv_in = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_in"]);
auto conv_norm_out = std::dynamic_pointer_cast<PixelNorm3D>(blocks["conv_norm_out"]);
auto conv_out = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_out"]);
x = conv_in->forward(ctx, x, true);
int block_idx = 0;
while (blocks.find("down_blocks." + std::to_string(block_idx)) != blocks.end()) {
auto mid_block = std::dynamic_pointer_cast<UNetMidBlock3D>(blocks["down_blocks." + std::to_string(block_idx)]);
if (mid_block) {
x = mid_block->forward(ctx, x, nullptr, true);
} else {
auto downsample = std::dynamic_pointer_cast<SpaceToDepthDownsample>(blocks["down_blocks." + std::to_string(block_idx)]);
x = downsample->forward(ctx, x, true);
}
block_idx++;
}
x = conv_norm_out->forward(ctx, x);
x = ggml_silu_inplace(ctx->ggml_ctx, x);
x = conv_out->forward(ctx, x, true);
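// conv_out emits latent_channels + 1 channels; broadcasting the last channel
// into latent_channels - 1 extra copies yields 2 * latent_channels channels,
// presumably a [mean | shared log-variance] layout, of which encode() keeps
// only the mean chunk.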
auto last_channel = ggml_ext_slice(ctx->ggml_ctx, x, 3, x->ne[3] - 1, x->ne[3]);
auto repeat_shape = ggml_new_tensor_4d(ctx->ggml_ctx, last_channel->type, last_channel->ne[0], last_channel->ne[1], last_channel->ne[2], latent_channels - 1);
auto repeated = ggml_repeat(ctx->ggml_ctx, last_channel, repeat_shape);
return ggml_concat(ctx->ggml_ctx, x, repeated, 3);
}
};
struct Decoder : public GGMLBlock {
int version;
int patch_size;
bool causal_decoder;
bool timestep_conditioning;
int64_t in_channels;
int64_t hidden_channels;
protected:
void init_params(ggml_context* ctx,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "") override {
if (timestep_conditioning) {
params["timestep_scale_multiplier"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
params["last_scale_shift_table"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_channels, 2);
}
}
public:
Decoder(int version,
const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
int patch_size = 4,
bool causal_decoder = false,
bool timestep_conditioning = true,
int64_t in_channels = 128,
int64_t hidden_channels = 128)
: version(version),
patch_size(patch_size),
causal_decoder(causal_decoder),
timestep_conditioning(timestep_conditioning),
in_channels(in_channels),
hidden_channels(hidden_channels) {
const int64_t conv_in_out_channels = get_tensor_ne0(tensor_storage_map,
prefix + ".decoder.conv_in.conv.bias",
hidden_channels);
auto cfg = infer_decoder_config_from_weights(tensor_storage_map,
prefix,
conv_in_out_channels);
int64_t channels = conv_in_out_channels;
blocks["conv_in"] = std::make_shared<CausalConv3d>(in_channels, channels, 3);
for (int block_idx = 0; block_idx < static_cast<int>(cfg.blocks.size()); ++block_idx) {
const auto& block = cfg.blocks[block_idx];
if (block.type == "res_x") {
blocks["up_blocks." + std::to_string(block_idx)] = std::make_shared<UNetMidBlock3D>(channels,
block.num_layers,
timestep_conditioning);
} else if (block.type == "compress_all") {
blocks["up_blocks." + std::to_string(block_idx)] = std::make_shared<DepthToSpaceUpsample>(channels,
2,
2,
block.multiplier,
false);
channels /= block.multiplier;
} else if (block.type == "compress_time") {
blocks["up_blocks." + std::to_string(block_idx)] = std::make_shared<DepthToSpaceUpsample>(channels,
2,
1,
block.multiplier,
false);
channels /= block.multiplier;
} else if (block.type == "compress_space") {
blocks["up_blocks." + std::to_string(block_idx)] = std::make_shared<DepthToSpaceUpsample>(channels,
1,
2,
block.multiplier,
false);
channels /= block.multiplier;
} else {
GGML_ABORT("Unsupported LTX VAE decoder block");
}
}
hidden_channels = channels;
blocks["conv_norm_out"] = std::make_shared<PixelNorm3D>();
blocks["conv_out"] = std::make_shared<CausalConv3d>(hidden_channels, 3 * patch_size * patch_size, 3);
if (timestep_conditioning) {
blocks["last_time_embedder"] = std::make_shared<PixArtAlphaCombinedTimestepSizeEmbeddings>(hidden_channels * 2);
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* timestep) {
auto conv_in = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_in"]);
auto conv_norm_out = std::dynamic_pointer_cast<PixelNorm3D>(blocks["conv_norm_out"]);
auto conv_out = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_out"]);
ggml_tensor* scaled_timestep = timestep;
if (timestep_conditioning) {
auto multiplier = ggml_ext_backend_tensor_get_f32(params["timestep_scale_multiplier"]);
scaled_timestep = ggml_ext_scale(ctx->ggml_ctx, timestep, multiplier);
}
x = conv_in->forward(ctx, x, causal_decoder);
int block_idx = 0;
while (blocks.find("up_blocks." + std::to_string(block_idx)) != blocks.end()) {
auto mid_block = std::dynamic_pointer_cast<UNetMidBlock3D>(blocks["up_blocks." + std::to_string(block_idx)]);
if (mid_block) {
x = mid_block->forward(ctx, x, scaled_timestep, causal_decoder);
} else {
auto upsample = std::dynamic_pointer_cast<DepthToSpaceUpsample>(blocks["up_blocks." + std::to_string(block_idx)]);
x = upsample->forward(ctx, x, causal_decoder);
}
block_idx++;
}
x = conv_norm_out->forward(ctx, x);
if (timestep_conditioning) {
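// AdaLN-style final modulation: combine the learned scale/shift table with
// the timestep embedding and apply it to the normalized output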
auto last_time_embedder = std::dynamic_pointer_cast<PixArtAlphaCombinedTimestepSizeEmbeddings>(blocks["last_time_embedder"]);
auto timestep_embed = last_time_embedder->forward(ctx, scaled_timestep);
auto [shift, scale] = get_shift_scale(ctx->ggml_ctx,
params["last_scale_shift_table"],
timestep_embed,
hidden_channels,
2);
x = apply_scale_shift(ctx->ggml_ctx, x, scale, shift);
}
x = ggml_silu_inplace(ctx->ggml_ctx, x);
x = conv_out->forward(ctx, x, causal_decoder);
return x;
}
};
struct VideoVAE : public GGMLBlock {
int version;
float decode_timestep;
bool timestep_conditioning;
int patch_size;
bool decode_only;
VideoVAE(int version,
bool decode_only,
bool timestep_conditioning,
int patch_size,
const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
float decode_timestep = 0.05f)
: version(version),
decode_timestep(decode_timestep),
timestep_conditioning(timestep_conditioning),
patch_size(patch_size),
decode_only(decode_only) {
if (!decode_only) {
blocks["encoder"] = std::make_shared<Encoder>(version, patch_size);
}
blocks["decoder"] = std::make_shared<Decoder>(version,
tensor_storage_map,
prefix,
patch_size,
false,
timestep_conditioning);
blocks["per_channel_statistics"] = std::make_shared<PerChannelStatistics>();
}
ggml_tensor* decode(GGMLRunnerContext* ctx,
ggml_tensor* z,
ggml_tensor* timestep) {
auto decoder = std::dynamic_pointer_cast<Decoder>(blocks["decoder"]);
auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
auto latents = processor->un_normalize(ctx, z);
auto out = decoder->forward(ctx, latents, timestep);
out = WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
return out;
}
ggml_tensor* encode(GGMLRunnerContext* ctx,
ggml_tensor* x) {
GGML_ASSERT(!decode_only);
auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);
auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
x = patchify(ctx->ggml_ctx, x, patch_size);
auto out = encoder->forward(ctx, x);
auto mean = ggml_ext_chunk(ctx->ggml_ctx, out, 2, 3, false)[0];
mean = ggml_cont(ctx->ggml_ctx, mean);
return processor->normalize(ctx, mean);
}
};
} // namespace LTXVAE
struct LTXVideoVAE : public VAE {
bool decode_only;
int ltx_vae_version;
bool timestep_conditioning;
int patch_size;
sd::Tensor<float> decode_timestep_tensor;
LTXVAE::VideoVAE vae;
LTXVideoVAE(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
bool decode_only = true,
SDVersion version = VERSION_LTXAV)
: VAE(version, backend, offload_params_to_cpu),
decode_only(decode_only),
ltx_vae_version(LTXVAE::detect_ltx_vae_version(tensor_storage_map, prefix)),
timestep_conditioning(LTXVAE::detect_ltx_vae_timestep_conditioning(tensor_storage_map, prefix)),
patch_size(4),
decode_timestep_tensor(sd::Tensor<float>::from_vector({0.05f})),
// members are initialized in declaration order, so ltx_vae_version,
// timestep_conditioning and patch_size are valid by the time vae is built
vae(ltx_vae_version,
decode_only,
timestep_conditioning,
patch_size,
tensor_storage_map,
prefix) {
vae.init(params_ctx, tensor_storage_map, prefix);
decode_timestep_tensor.values()[0] = vae.decode_timestep;
}
std::string get_desc() override {
return "ltx_video_vae";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {
vae.get_param_tensors(tensors, prefix);
}
ggml_cgraph* build_graph(const sd::Tensor<float>& z_tensor, bool decode_graph) {
LOG_DEBUG("ltx_video_vae build_graph input %dx%dx%dx%d",
(int)z_tensor.shape()[0],
(int)z_tensor.shape()[1],
(int)z_tensor.shape()[2],
(int)z_tensor.shape()[3]);
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
ggml_tensor* z = make_input(z_tensor);
ggml_tensor* timestep = nullptr;
if (timestep_conditioning) {
timestep = make_input(decode_timestep_tensor);
}
auto runner_ctx = get_context();
ggml_tensor* out = decode_graph ? vae.decode(&runner_ctx, z, timestep) : vae.encode(&runner_ctx, z);
LOG_DEBUG("ltx_video_vae build_graph output ne=[%lld,%lld,%lld,%lld]",
(long long)out->ne[0],
(long long)out->ne[1],
(long long)out->ne[2],
(long long)out->ne[3]);
ggml_build_forward_expand(gf, out);
return gf;
}
sd::Tensor<float> _compute(const int n_threads,
const sd::Tensor<float>& z,
bool decode_graph) override {
if (!decode_graph && decode_only) {
LOG_ERROR("LTX video VAE encoder is not implemented yet");
return {};
}
sd::Tensor<float> input = z;
size_t expected_dim = static_cast<size_t>(z.dim());
if (!decode_graph) {
if (input.dim() == 4) {
input = input.unsqueeze(2);
expected_dim = 5;
} else if (input.dim() != 5) {
LOG_ERROR("LTX video VAE encoder expects 4D image or 5D video input, got dim=%lld",
(long long)input.dim());
return {};
}
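// the encoder maps 8*n+1 input frames to n+1 latent frames, so crop the time
// axis down to the nearest 8*n+1, e.g. 20 frames -> 1 + ((20 - 1) / 8) * 8 = 17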
int64_t cropped_t = std::max<int64_t>(1, 1 + ((input.shape()[2] - 1) / 8) * 8);
if (cropped_t != input.shape()[2]) {
input = sd::ops::slice(input, 2, 0, cropped_t);
}
}
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(input, decode_graph);
};
auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), expected_dim);
if (result.empty()) {
return {};
}
LOG_DEBUG("ltx_video_vae host output shape=[%lld,%lld,%lld,%lld] dim=%lld",
(long long)(result.shape().size() > 0 ? result.shape()[0] : 0),
(long long)(result.shape().size() > 1 ? result.shape()[1] : 0),
(long long)(result.shape().size() > 2 ? result.shape()[2] : 0),
(long long)(result.shape().size() > 3 ? result.shape()[3] : 0),
(long long)result.dim());
return result;
}
int get_encoder_output_channels(int input_channels) override {
SD_UNUSED(input_channels);
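// 2 * 128: mean plus logvar layout; vae_output_to_latents below keeps only
// the first 128 channels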
return 256;
}
sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
SD_UNUSED(rng);
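// the first 128 channels hold the latent mean; drop any padded logvar copies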
if (vae_output.dim() >= 4 && vae_output.shape()[3] > 128) {
return sd::ops::slice(vae_output, 3, 0, 128);
}
return vae_output;
}
sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
return latents;
}
sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
return latents;
}
void test(const std::string& input_path) {
auto z = sd::load_tensor_from_file_as_tensor<float>(input_path);
print_sd_tensor(z, false, "ltx_vae_z");
z = diffusion_to_vae_latents(z);
int64_t t0 = ggml_time_ms();
auto out = _compute(8, z, true);
int64_t t1 = ggml_time_ms();
GGML_ASSERT(!out.empty());
print_sd_tensor(out, false, "ltx_vae_out");
LOG_DEBUG("ltx vae test done in %lldms", t1 - t0);
}
static void load_from_file_and_test(const std::string& model_path,
const std::string& input_path) {
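// debug/test helper; the backend is hard-coded to CUDA device 0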
ggml_backend_t backend = ggml_backend_cuda_init(0);
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(model_path, "vae.")) {
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
return;
}
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
std::shared_ptr<LTXVideoVAE> vae = std::make_shared<LTXVideoVAE>(backend,
false,
tensor_storage_map,
"first_stage_model",
true,
VERSION_LTXAV);
vae->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
vae->get_param_tensors(tensors, "first_stage_model");
if (!model_loader.load_tensors(tensors)) {
LOG_ERROR("load tensors from model loader failed");
return;
}
LOG_INFO("ltx vae model loaded");
vae->test(input_path);
}
};
#endif // __SD_LTX_VAE_H__

src/ltx_vae_test.cpp Normal file
View File

@ -0,0 +1,8 @@
#include "ltx_vae_test.h"
#include "ltx_vae.h"
void ltx_vae_load_from_file_and_test(const std::string& model_path,
const std::string& input_path) {
LTXVideoVAE::load_from_file_and_test(model_path, input_path);
}

src/ltx_vae_test.h Normal file
View File

@ -0,0 +1,9 @@
#ifndef __SD_LTX_VAE_TEST_H__
#define __SD_LTX_VAE_TEST_H__
#include <string>
void ltx_vae_load_from_file_and_test(const std::string& model_path,
const std::string& input_path);
#endif // __SD_LTX_VAE_TEST_H__

File diff suppressed because it is too large

View File

@ -471,6 +471,9 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
return VERSION_ERNIE_IMAGE;
}
if (tensor_storage.name.find("model.diffusion_model.adaln_single.emb.timestep_embedder.linear_1.bias") != std::string::npos) {
return VERSION_LTXAV;
}
if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) {
is_wan = true;
}

View File

@ -42,6 +42,7 @@ enum SDVersion {
VERSION_ANIMA,
VERSION_FLUX2,
VERSION_FLUX2_KLEIN,
VERSION_LTXAV,
VERSION_Z_IMAGE,
VERSION_OVIS_IMAGE,
VERSION_ERNIE_IMAGE,
@ -104,6 +105,13 @@ static inline bool sd_version_is_flux2(SDVersion version) {
return false;
}
static inline bool sd_version_is_ltxav(SDVersion version) {
if (version == VERSION_LTXAV) {
return true;
}
return false;
}
static inline bool sd_version_is_wan(SDVersion version) {
if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) {
return true;
@ -160,6 +168,7 @@ static inline bool sd_version_is_inpaint(SDVersion version) {
static inline bool sd_version_is_dit(SDVersion version) {
if (sd_version_is_flux(version) ||
sd_version_is_flux2(version) ||
sd_version_is_ltxav(version) ||
sd_version_is_sd3(version) ||
sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) ||

View File

@ -14,6 +14,7 @@
#include "diffusion_model.hpp"
#include "esrgan.hpp"
#include "lora.hpp"
#include "ltx_vae.h"
#include "pmid.hpp"
#include "sample-cache.h"
#include "tae.hpp"
@ -52,6 +53,7 @@ const char* model_version_to_str[] = {
"Anima",
"Flux.2",
"Flux.2 klein",
"LTXAV",
"Z-Image",
"Ovis Image",
"Ernie Image",
@ -351,6 +353,17 @@ public:
return false;
}
if (strlen(SAFE_STR(sd_ctx_params->embeddings_connectors_path)) > 0) {
if (sd_version_is_ltxav(version)) {
LOG_INFO("loading embeddings connectors from '%s'", sd_ctx_params->embeddings_connectors_path);
if (!model_loader.init_from_file(sd_ctx_params->embeddings_connectors_path)) {
LOG_WARN("loading embeddings connectors from '%s' failed", sd_ctx_params->embeddings_connectors_path);
}
} else {
LOG_WARN("ignoring embeddings connectors for non-LTXAV model: '%s'", sd_ctx_params->embeddings_connectors_path);
}
}
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
LOG_INFO("Version: %s ", model_version_to_str[version]);
@ -415,6 +428,9 @@ public:
// Might need vae encode for control cond
vae_decode_only = false;
}
if (sd_version_is_ltxav(version)) {
vae_decode_only = true;
}
bool tae_preview_only = sd_ctx_params->tae_preview_only;
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
@ -492,6 +508,14 @@ public:
tensor_storage_map,
version,
sd_ctx_params->chroma_use_dit_mask);
} else if (sd_version_is_ltxav(version)) {
cond_stage_model = std::make_shared<LTXAVEmbedder>(clip_backend,
offload_params_to_cpu,
tensor_storage_map);
diffusion_model = std::make_shared<LTXAVModel>(backend,
offload_params_to_cpu,
tensor_storage_map,
"model.diffusion_model");
} else if (sd_version_is_wan(version)) {
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
offload_params_to_cpu,
@ -638,7 +662,14 @@ public:
};
auto create_vae = [&]() -> std::shared_ptr<VAE> {
if (sd_version_is_wan(version) ||
if (sd_version_is_ltxav(version)) {
return std::make_shared<LTXVideoVAE>(vae_backend,
offload_params_to_cpu,
tensor_storage_map,
"first_stage_model",
true,
version);
} else if (sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) ||
sd_version_is_anima(version)) {
return std::make_shared<WAN::WanVAERunner>(vae_backend,
@ -936,13 +967,16 @@ public:
pred_type = EPS_PRED;
}
} else if (sd_version_is_sd3(version) ||
sd_version_is_ltxav(version) ||
sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) ||
sd_version_is_anima(version) ||
sd_version_is_ernie_image(version) ||
sd_version_is_z_image(version)) {
pred_type = FLOW_PRED;
if (sd_version_is_wan(version)) {
if (sd_version_is_ltxav(version)) {
default_flow_shift = 2.37f;
} else if (sd_version_is_wan(version)) {
default_flow_shift = 5.f;
} else if (sd_version_is_ernie_image(version)) {
default_flow_shift = 4.f;
@ -979,8 +1013,13 @@ public:
denoiser = std::make_shared<EDMVDenoiser>();
break;
case FLOW_PRED: {
if (sd_version_is_ltxav(version)) {
LOG_INFO("running in LTXAV FLOW mode");
denoiser = std::make_shared<FluxFlowDenoiser>();
} else {
LOG_INFO("running in FLOW mode");
denoiser = std::make_shared<DiscreteFlowDenoiser>();
}
break;
}
case FLUX_FLOW_PRED: {
@ -1621,6 +1660,7 @@ public:
const sd::Tensor<float>& denoise_mask,
const sd::Tensor<float>& vace_context,
float vace_strength,
int audio_length,
const sd_cache_params_t* cache_params) {
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
float cfg_scale = guidance.txt_cfg;
@ -1699,6 +1739,7 @@ public:
diffusion_params.control_strength = control_strength;
diffusion_params.vace_context = vace_context.empty() ? nullptr : &vace_context;
diffusion_params.vace_strength = vace_strength;
diffusion_params.audio_length = audio_length;
diffusion_params.skip_layers = nullptr;
compute_sample_controls(control_image,
@ -1860,7 +1901,9 @@ public:
int get_latent_channel() {
int latent_channel = 4;
if (sd_version_is_dit(version)) {
if (version == VERSION_WAN2_2_TI2V) {
if (sd_version_is_ltxav(version)) {
latent_channel = 128;
} else if (version == VERSION_WAN2_2_TI2V) {
latent_channel = 48;
} else if (version == VERSION_CHROMA_RADIANCE) {
latent_channel = 3;
@ -1886,7 +1929,9 @@ public:
int W = width / vae_scale_factor;
int H = height / vae_scale_factor;
int T = frames;
if (sd_version_is_wan(version)) {
if (sd_version_is_ltxav(version)) {
T = ((T - 1) / 8) + 1;
} else if (sd_version_is_wan(version)) {
T = ((T - 1) / 4) + 1;
}
int C = get_latent_channel();
@ -2223,6 +2268,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
"llm_vision_path: %s\n"
"diffusion_model_path: %s\n"
"high_noise_diffusion_model_path: %s\n"
"embeddings_connectors_path: %s\n"
"vae_path: %s\n"
"taesd_path: %s\n"
"control_net_path: %s\n"
@ -2255,6 +2301,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
SAFE_STR(sd_ctx_params->llm_vision_path),
SAFE_STR(sd_ctx_params->diffusion_model_path),
SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
SAFE_STR(sd_ctx_params->embeddings_connectors_path),
SAFE_STR(sd_ctx_params->vae_path),
SAFE_STR(sd_ctx_params->taesd_path),
SAFE_STR(sd_ctx_params->control_net_path),
@ -2433,6 +2480,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
sd_vid_gen_params->strength = 0.75f;
sd_vid_gen_params->seed = -1;
sd_vid_gen_params->video_frames = 6;
sd_vid_gen_params->fps = 16;
sd_vid_gen_params->moe_boundary = 0.875f;
sd_vid_gen_params->vace_strength = 1.f;
sd_vid_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
@ -2444,7 +2492,7 @@ struct sd_ctx_t {
};
static bool sd_version_supports_video_generation(SDVersion version) {
return version == VERSION_SVD || sd_version_is_wan(version);
return version == VERSION_SVD || sd_version_is_wan(version) || sd_version_is_ltxav(version);
}
static bool sd_version_supports_image_generation(SDVersion version) {
@ -2589,6 +2637,8 @@ struct GenerationRequest {
sd_pm_params_t pm_params = {};
sd_hires_params_t hires = {};
int frames = -1;
int requested_frames = -1;
int fps = 16;
float vace_strength = 1.f;
GenerationRequest(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
@ -2619,8 +2669,14 @@ struct GenerationRequest {
negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt);
width = sd_vid_gen_params->width;
height = sd_vid_gen_params->height;
frames = (sd_vid_gen_params->video_frames - 1) / 4 * 4 + 1;
requested_frames = std::max(1, sd_vid_gen_params->video_frames);
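// LTX latents cover 8 frames per temporal step (vs 4 for WAN), so round the
// request up to the next 8*n+1, e.g. 100 -> ((99 + 7) / 8) * 8 + 1 = 105;
// the surplus decoded frames are trimmed again in decode_video_outputs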
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
frames = ((requested_frames - 1 + 7) / 8) * 8 + 1;
} else {
frames = (requested_frames - 1) / 4 * 4 + 1;
}
clip_skip = sd_vid_gen_params->clip_skip;
fps = std::max(1, sd_vid_gen_params->fps);
vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
seed = sd_vid_gen_params->seed;
@ -2629,6 +2685,12 @@ struct GenerationRequest {
guidance = sd_vid_gen_params->sample_params.guidance;
high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance;
resolve(sd_ctx);
if (frames != requested_frames) {
LOG_WARN("align video frames from %d to %d for %s",
requested_frames,
frames,
model_version_to_str[sd_ctx->sd->version]);
}
}
void align_generation_request_size() {
@ -2858,6 +2920,7 @@ struct ImageGenerationLatents {
sd::Tensor<float> init_latent;
sd::Tensor<float> concat_latent;
sd::Tensor<float> uncond_concat_latent;
sd::Tensor<float> audio_latent;
sd::Tensor<float> control_image;
std::vector<sd::Tensor<float>> ref_images;
std::vector<sd::Tensor<float>> ref_latents;
@ -2865,8 +2928,51 @@ struct ImageGenerationLatents {
sd::Tensor<float> clip_vision_output;
sd::Tensor<float> vace_context;
int64_t ref_image_num = 0;
int audio_length = 0;
};
static sd::Tensor<float> pack_ltxav_audio_and_video_latents(const sd::Tensor<float>& video_latent,
const sd::Tensor<float>& audio_latent) {
if (audio_latent.empty()) {
return video_latent;
}
GGML_ASSERT(video_latent.dim() == 4 || video_latent.dim() == 5);
GGML_ASSERT(audio_latent.dim() == 3 || audio_latent.dim() == 4);
if (video_latent.dim() == 5) {
GGML_ASSERT(video_latent.shape()[4] == 1);
}
if (audio_latent.dim() == 4) {
GGML_ASSERT(audio_latent.shape()[3] == 1);
}
int64_t width = video_latent.shape()[0];
int64_t height = video_latent.shape()[1];
int64_t frames = video_latent.shape()[2];
int64_t video_ch = video_latent.shape()[3];
int64_t spatial_size = width * height * frames;
int64_t audio_values = audio_latent.numel();
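// spread the flattened audio latent across extra channels of the video grid,
// rounding up: e.g. a 16x190x8 audio latent (24320 values) over a 32x32x13
// latent grid (13312 positions) needs ceil(24320 / 13312) = 2 extra channels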
int64_t extra_ch = (audio_values + spatial_size - 1) / spatial_size;
std::vector<int64_t> packed_shape = video_latent.shape();
packed_shape[3] = video_ch + extra_ch;
sd::Tensor<float> packed = sd::zeros<float>(packed_shape);
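// with dim 0 fastest in memory, channels (dim 3) are outermost, so copying
// the flattened audio right after the video data fills the appended channels;
// the tail of the last extra channel stays zero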
std::copy_n(video_latent.data(), video_latent.numel(), packed.data());
std::copy_n(audio_latent.data(), audio_latent.numel(), packed.data() + video_latent.numel());
return packed;
}
static int get_ltxav_num_audio_latents(int frames, int fps) {
GGML_ASSERT(frames > 0);
GGML_ASSERT(fps > 0);
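// 16 kHz audio, 160-sample mel hop, 4x temporal downsampling in the audio
// latent => 16000 / 160 / 4 = 25 audio latents per second of video;
// e.g. 121 frames at 16 fps -> ceil(7.5625 * 25) = 190 latents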
constexpr float kSampleRate = 16000.0f;
constexpr float kMelHopLength = 160.0f;
constexpr float kAudioLatentDownsample = 4.0f;
constexpr float kLatentsPerSecond = kSampleRate / kMelHopLength / kAudioLatentDownsample;
return static_cast<int>(std::ceil((static_cast<float>(frames) / static_cast<float>(fps)) * kLatentsPerSecond));
}
struct ImageGenerationEmbeds {
SDCondition cond;
SDCondition uncond;
@ -3454,6 +3560,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
latents.denoise_mask,
sd::Tensor<float>(),
1.f,
0,
request.cache_params);
int64_t sampling_end = ggml_time_ms();
if (!x_0.empty()) {
@ -3575,6 +3682,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
hires_denoise_mask,
sd::Tensor<float>(),
1.f,
0,
request.cache_params);
int64_t hires_sample_end = ggml_time_ms();
if (!x_0.empty()) {
@ -3633,6 +3741,18 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
end_image = sd_image_to_tensor(sd_vid_gen_params->end_image, request->width, request->height);
}
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
if (!start_image.empty() || !end_image.empty() || sd_vid_gen_params->control_frames_size > 0) {
LOG_ERROR("LTXAV currently supports txt2vid only; init_image, end_image, and control_frames are not implemented");
return std::nullopt;
}
latents.audio_length = get_ltxav_num_audio_latents(request->frames, request->fps);
latents.audio_latent = sd::zeros<float>({16, latents.audio_length, 8, 1});
}
if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" ||
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" ||
@ -3803,6 +3923,10 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
}
if (!latents.audio_latent.empty()) {
latents.init_latent = pack_ltxav_audio_and_video_latents(latents.init_latent, latents.audio_latent);
}
return latents;
}
@ -3839,14 +3963,26 @@ static ImageGenerationEmbeds prepare_video_generation_embeds(sd_ctx_t* sd_ctx,
}
static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
const GenerationRequest& request,
const sd::Tensor<float>& final_latent,
int* num_frames_out) {
if (final_latent.empty()) {
LOG_ERROR("no latent video to decode");
return nullptr;
}
sd::Tensor<float> video_latent = final_latent;
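// sampling may have run on video latents with packed audio channels appended;
// keep only the first get_latent_channel() (128) channels for the video VAE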
if (sd_version_is_ltxav(sd_ctx->sd->version) &&
video_latent.shape()[3] > sd_ctx->sd->get_latent_channel()) {
video_latent = sd::ops::slice(video_latent, 3, 0, sd_ctx->sd->get_latent_channel());
}
LOG_DEBUG("decode_video_outputs latent %dx%dx%dx%d",
(int)video_latent.shape()[0],
(int)video_latent.shape()[1],
(int)video_latent.shape()[2],
(int)video_latent.shape()[3]);
int64_t t4 = ggml_time_ms();
sd::Tensor<float> vid = sd_ctx->sd->decode_first_stage(final_latent, true);
sd::Tensor<float> vid = sd_ctx->sd->decode_first_stage(video_latent, true);
int64_t t5 = ggml_time_ms();
LOG_INFO("decode_first_stage completed, taking %.2fs", (t5 - t4) * 1.0f / 1000);
if (sd_ctx->sd->free_params_immediately) {
@ -3856,6 +3992,15 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
LOG_ERROR("decode_first_stage failed for video");
return nullptr;
}
LOG_DEBUG("decode_video_outputs decoded %dx%dx%dx%d",
(int)vid.shape()[0],
(int)vid.shape()[1],
(int)vid.shape()[2],
(int)vid.shape()[3]);
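// the frame count was rounded up to 8*n+1 before sampling; trim the decoded
// video back to the number of frames the caller actually requested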
if (request.requested_frames > 0 &&
vid.shape()[2] > request.requested_frames) {
vid = sd::ops::slice(vid, 2, 0, request.requested_frames);
}
sd_image_t* result_images = (sd_image_t*)calloc(vid.shape()[2], sizeof(sd_image_t));
if (result_images == nullptr) {
@ -3939,6 +4084,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
latents.denoise_mask,
latents.vace_context,
request.vace_strength,
latents.audio_length,
request.cache_params);
int64_t sampling_end = ggml_time_ms();
if (x_t_sampled.empty()) {
@ -3981,6 +4127,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
latents.denoise_mask,
latents.vace_context,
request.vace_strength,
latents.audio_length,
request.cache_params);
int64_t sampling_end = ggml_time_ms();
@ -4000,7 +4147,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
int64_t latent_end = ggml_time_ms();
LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000);
auto result = decode_video_outputs(sd_ctx, final_latent, num_frames_out);
auto result = decode_video_outputs(sd_ctx, request, final_latent, num_frames_out);
if (result == nullptr) {
return nullptr;
}

View File

@ -2,7 +2,6 @@
#define __TAE_HPP__
#include "ggml_extend.hpp"
#include "model.h"
/*

View File

@ -104,7 +104,7 @@ namespace sd {
throw std::invalid_argument("tensor file type does not match requested sd::Tensor type");
}
std::vector<int64_t> shape(4, 1);
std::vector<int64_t> shape(n_dims, 1);
for (int i = 0; i < n_dims; ++i) {
int32_t dim = 1;
file.read(reinterpret_cast<char*>(&dim), sizeof(dim));

View File

@ -50,6 +50,7 @@ GemmaTokenizer::GemmaTokenizer(const std::string& merges_utf8_str, const std::st
byte_level_bpe = false;
byte_fallback = true;
add_bos_token = true;
pad_left = true;
PAD_TOKEN = "<pad>";
EOS_TOKEN = "<eos>";
BOS_TOKEN = "<bos>";

View File

@ -67,7 +67,9 @@ public:
int get_scale_factor() {
int scale_factor = 8;
if (version == VERSION_WAN2_2_TI2V) {
if (version == VERSION_LTXAV) {
scale_factor = 32;
} else if (version == VERSION_WAN2_2_TI2V) {
scale_factor = 16;
} else if (sd_version_uses_flux2_vae(version)) {
scale_factor = 16;

View File

@ -966,7 +966,7 @@ namespace WAN {
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new CausalConv3d(z_dim, z_dim, {1, 1, 1}));
}
ggml_tensor* patchify(ggml_context* ctx,
static ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int64_t patch_size,
int64_t b = 1) {
@ -993,7 +993,7 @@ namespace WAN {
return x;
}
ggml_tensor* unpatchify(ggml_context* ctx,
static ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t patch_size,
int64_t b = 1) {