feat: add ltx2.3 flf2v support (#1505)

2026-06-09 15:56:39 +00:00 · 2026-05-17 18:40:14 +08:00 · 2026-05-17 18:40:14 +08:00 · e43b24cf48
commit e43b24cf48
parent 06accf2b39
5 changed files with 338 additions and 52 deletions
--- a/assets/ltx2/flf2v.webm
+++ b/assets/ltx2/flf2v.webm
--- a/docs/ltx2.md
+++ b/docs/ltx2.md
@ -38,4 +38,16 @@
  src="../assets/ltx2/i2v.webm"
  controls
  muted
+  style="max-width: 100%; height: auto;"></video>
+
+### LTX-2.3 dev FLF2V
+
+```
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors  -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v  -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png -o flf2v.webm
+```
+
+<video
+  src="../assets/ltx2/flf2v.webm"
+  controls
+  muted
  style="max-width: 100%; height: auto;"></video>
--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@ -40,6 +40,7 @@ struct DiffusionParams {
    float vace_strength                                                = 1.f;
    int audio_length                                                   = 0;
    float frame_rate                                                   = 24.f;
+    const sd::Tensor<float>* video_positions                           = nullptr;
    const std::vector<int>* skip_layers                                = nullptr;
 };

@ -766,7 +767,8 @@ struct LTXAVModel : public DiffusionModel {
                             tensor_or_empty(diffusion_params.audio_x),
                             tensor_or_empty(diffusion_params.audio_timesteps),
                             diffusion_params.audio_length,
-                             diffusion_params.frame_rate);
+                             diffusion_params.frame_rate,
+                             tensor_or_empty(diffusion_params.video_positions));
    }
 };

--- a/src/ltxv.hpp
+++ b/src/ltxv.hpp
@ -243,6 +243,56 @@ namespace LTXV {
        return build_rope_matrix_from_frequencies(freqs, dim);
    }

+    __STATIC_INLINE__ std::vector<float> build_video_rope_matrix_from_positions(const sd::Tensor<float>& positions,
+                                                                                int dim,
+                                                                                int num_heads,
+                                                                                float theta,
+                                                                                const std::vector<int>& max_pos,
+                                                                                bool use_middle_indices_grid) {
+        GGML_ASSERT(max_pos.size() == 3);
+        GGML_ASSERT(dim % num_heads == 0);
+        GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4);
+        GGML_ASSERT(positions.shape()[0] == 2);
+        GGML_ASSERT(positions.shape()[1] == 3);
+        if (positions.dim() == 4) {
+            GGML_ASSERT(positions.shape()[3] == 1);
+        }
+
+        const int64_t tokens             = positions.shape()[2];
+        const std::vector<float> indices = generate_freq_grid(theta, 3, dim);
+        const int half_dim               = dim / 2;
+        const int pad_size               = half_dim - static_cast<int>(indices.size()) * 3;
+        std::vector<std::vector<float>> freqs(static_cast<size_t>(tokens), std::vector<float>(half_dim, 0.f));
+
+        for (int64_t token = 0; token < tokens; token++) {
+            int out_idx = 0;
+            for (int i = 0; i < pad_size; i++) {
+                freqs[token][out_idx++] = 0.f;
+            }
+
+            float coords[3];
+            for (int axis = 0; axis < 3; axis++) {
+                float start  = positions.dim() == 4 ? positions.index(0, axis, token, 0)
+                                                    : positions.index(0, axis, token);
+                float end    = positions.dim() == 4 ? positions.index(1, axis, token, 0)
+                                                    : positions.index(1, axis, token);
+                float coord  = use_middle_indices_grid ? 0.5f * (start + end) : start;
+                coords[axis] = coord / static_cast<float>(max_pos[axis]);
+            }
+
+            for (float index : indices) {
+                for (int axis = 0; axis < 3; axis++) {
+                    freqs[token][out_idx++] = index * (coords[axis] * 2.f - 1.f);
+                }
+            }
+        }
+
+        if (num_heads > 1) {
+            return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
+        }
+        return build_rope_matrix_from_frequencies(freqs, dim);
+    }
+
    __STATIC_INLINE__ std::vector<float> build_1d_rope_matrix(int64_t seq_len,
                                                              int dim,
                                                              int num_heads          = 1,
@ -848,6 +898,31 @@ namespace LTXV {
        return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
    }

+    __STATIC_INLINE__ std::vector<float> build_video_temporal_rope_matrix_from_positions(const sd::Tensor<float>& positions,
+                                                                                         int dim,
+                                                                                         int num_heads,
+                                                                                         float theta,
+                                                                                         int max_pos_t,
+                                                                                         bool use_middle_indices_grid) {
+        GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4);
+        GGML_ASSERT(positions.shape()[0] == 2);
+        GGML_ASSERT(positions.shape()[1] >= 1);
+        if (positions.dim() == 4) {
+            GGML_ASSERT(positions.shape()[3] == 1);
+        }
+
+        std::vector<float> coords;
+        coords.reserve(static_cast<size_t>(positions.shape()[2]));
+        for (int64_t token = 0; token < positions.shape()[2]; token++) {
+            float start = positions.dim() == 4 ? positions.index(0, 0, token, 0)
+                                               : positions.index(0, 0, token);
+            float end   = positions.dim() == 4 ? positions.index(1, 0, token, 0)
+                                               : positions.index(1, 0, token);
+            coords.push_back(use_middle_indices_grid ? 0.5f * (start + end) : start);
+        }
+        return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
+    }
+
    __STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
                                                        int audio_latent_downsample_factor = 4,
                                                        int hop_length                     = 160,
@ -1664,7 +1739,8 @@ namespace LTXV {
                                 const sd::Tensor<float>& audio_x_tensor         = {},
                                 const sd::Tensor<float>& audio_timesteps_tensor = {},
                                 int audio_length                                = 0,
-                                 float frame_rate                                = 24.f) {
+                                 float frame_rate                                = 24.f,
+                                 const sd::Tensor<float>& video_positions_tensor = {}) {
            auto split_inputs = split_av_latents(x_tensor, audio_length);
            vx_input_cache    = split_inputs.first;
            if (!audio_x_tensor.empty()) {
@ -1681,19 +1757,31 @@ namespace LTXV {

            ggml_cgraph* gf = new_graph_custom(LTXAV_GRAPH_SIZE);

-            float video_frame_rate = frame_rate > 0.f ? frame_rate : 24.f;
-            video_pe_vec           = build_video_rope_matrix(vx->ne[0],
-                                                             vx->ne[1],
-                                                             vx->ne[2],
-                                                             static_cast<int>(params.hidden_size),
-                                                             static_cast<int>(params.num_attention_heads),
-                                                             video_frame_rate,
-                                                             params.positional_embedding_theta,
-                                                             params.positional_embedding_max_pos,
-                                                             params.vae_scale_factors,
-                                                             params.causal_temporal_positioning,
-                                                             params.use_middle_indices_grid);
-            auto video_pe          = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads);
+            float video_frame_rate    = frame_rate > 0.f ? frame_rate : 24.f;
+            int64_t video_token_count = vx->ne[0] * vx->ne[1] * vx->ne[2];
+            bool has_video_positions  = !video_positions_tensor.empty();
+            if (has_video_positions) {
+                GGML_ASSERT(video_positions_tensor.shape()[2] == video_token_count);
+                video_pe_vec = build_video_rope_matrix_from_positions(video_positions_tensor,
+                                                                      static_cast<int>(params.hidden_size),
+                                                                      static_cast<int>(params.num_attention_heads),
+                                                                      params.positional_embedding_theta,
+                                                                      params.positional_embedding_max_pos,
+                                                                      params.use_middle_indices_grid);
+            } else {
+                video_pe_vec = build_video_rope_matrix(vx->ne[0],
+                                                       vx->ne[1],
+                                                       vx->ne[2],
+                                                       static_cast<int>(params.hidden_size),
+                                                       static_cast<int>(params.num_attention_heads),
+                                                       video_frame_rate,
+                                                       params.positional_embedding_theta,
+                                                       params.positional_embedding_max_pos,
+                                                       params.vae_scale_factors,
+                                                       params.causal_temporal_positioning,
+                                                       params.use_middle_indices_grid);
+            }
+            auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, video_token_count * params.num_attention_heads);
            ggml_set_name(video_pe, "ltxav_video_pe");
            set_backend_tensor_data(video_pe, video_pe_vec.data());

@ -1712,18 +1800,27 @@ namespace LTXV {
                set_backend_tensor_data(audio_pe, audio_pe_vec.data());

                int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
-                video_cross_pe_vec   = build_video_temporal_rope_matrix(vx->ne[0],
-                                                                        vx->ne[1],
-                                                                        vx->ne[2],
-                                                                        static_cast<int>(params.audio_cross_attention_dim),
-                                                                        static_cast<int>(params.audio_num_attention_heads),
-                                                                        video_frame_rate,
-                                                                        params.positional_embedding_theta,
-                                                                        temporal_max_pos,
-                                                                        std::get<0>(params.vae_scale_factors),
-                                                                        params.causal_temporal_positioning,
-                                                                        true);
-                video_cross_pe       = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads);
+                if (has_video_positions) {
+                    video_cross_pe_vec = build_video_temporal_rope_matrix_from_positions(video_positions_tensor,
+                                                                                         static_cast<int>(params.audio_cross_attention_dim),
+                                                                                         static_cast<int>(params.audio_num_attention_heads),
+                                                                                         params.positional_embedding_theta,
+                                                                                         temporal_max_pos,
+                                                                                         true);
+                } else {
+                    video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0],
+                                                                          vx->ne[1],
+                                                                          vx->ne[2],
+                                                                          static_cast<int>(params.audio_cross_attention_dim),
+                                                                          static_cast<int>(params.audio_num_attention_heads),
+                                                                          video_frame_rate,
+                                                                          params.positional_embedding_theta,
+                                                                          temporal_max_pos,
+                                                                          std::get<0>(params.vae_scale_factors),
+                                                                          params.causal_temporal_positioning,
+                                                                          true);
+                }
+                video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, video_token_count * params.audio_num_attention_heads);
                ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
                set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());

@ -1806,9 +1903,10 @@ namespace LTXV {
                                  const sd::Tensor<float>& audio_x         = {},
                                  const sd::Tensor<float>& audio_timesteps = {},
                                  int audio_length                         = 0,
-                                  float frame_rate                         = 24.f) {
+                                  float frame_rate                         = 24.f,
+                                  const sd::Tensor<float>& video_positions = {}) {
            auto get_graph = [&]() -> ggml_cgraph* {
-                return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate);
+                return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate, video_positions);
            };
            auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
            return out;
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -1842,7 +1842,8 @@ public:
                             float vace_strength,
                             int audio_length,
                             float frame_rate,
-                             const sd_cache_params_t* cache_params) {
+                             const sd_cache_params_t* cache_params,
+                             const sd::Tensor<float>& video_positions = {}) {
        std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
        float cfg_scale     = guidance.txt_cfg;
        float img_cfg_scale = guidance.img_cfg;
@ -1948,6 +1949,7 @@ public:
            diffusion_params.vace_strength      = vace_strength;
            diffusion_params.audio_length       = audio_length;
            diffusion_params.frame_rate         = frame_rate;
+            diffusion_params.video_positions    = video_positions.empty() ? nullptr : &video_positions;
            diffusion_params.skip_layers        = nullptr;

            compute_sample_controls(control_image,
@ -3231,16 +3233,99 @@ struct ImageGenerationLatents {
    sd::Tensor<float> concat_latent;
    sd::Tensor<float> uncond_concat_latent;
    sd::Tensor<float> audio_latent;
+    sd::Tensor<float> video_positions;
    sd::Tensor<float> control_image;
    std::vector<sd::Tensor<float>> ref_images;
    std::vector<sd::Tensor<float>> ref_latents;
    sd::Tensor<float> denoise_mask;
    sd::Tensor<float> clip_vision_output;
    sd::Tensor<float> vace_context;
-    int64_t ref_image_num = 0;
-    int audio_length      = 0;
+    int64_t ref_image_num                  = 0;
+    int64_t video_conditioning_frame_count = 0;
+    int64_t video_target_frame_count       = 0;
+    int audio_length                       = 0;
 };

+static float ltxv_latent_corner_to_pixel_frame(int64_t corner_index,
+                                               int temporal_scale,
+                                               bool causal_temporal_positioning) {
+    float pixel_t = static_cast<float>(corner_index * temporal_scale);
+    if (causal_temporal_positioning) {
+        pixel_t = std::max(0.f, pixel_t + 1.f - static_cast<float>(temporal_scale));
+    }
+    return pixel_t;
+}
+
+static void set_ltxv_video_position(sd::Tensor<float>* positions,
+                                    int64_t token,
+                                    float t_start,
+                                    float t_end,
+                                    float h_start,
+                                    float h_end,
+                                    float w_start,
+                                    float w_end) {
+    positions->index(0, 0, token, 0) = t_start;
+    positions->index(1, 0, token, 0) = t_end;
+    positions->index(0, 1, token, 0) = h_start;
+    positions->index(1, 1, token, 0) = h_end;
+    positions->index(0, 2, token, 0) = w_start;
+    positions->index(1, 2, token, 0) = w_end;
+}
+
+static sd::Tensor<float> build_ltxv_video_positions(int64_t width,
+                                                    int64_t height,
+                                                    int64_t target_latent_frames,
+                                                    int64_t keyframe_latent_frames,
+                                                    int keyframe_frame_idx,
+                                                    int keyframe_pixel_frames,
+                                                    int fps,
+                                                    int spatial_scale,
+                                                    int temporal_scale,
+                                                    bool causal_temporal_positioning) {
+    GGML_ASSERT(width > 0 && height > 0 && target_latent_frames > 0);
+    GGML_ASSERT(keyframe_latent_frames > 0);
+    GGML_ASSERT(fps > 0);
+
+    int64_t total_tokens = width * height * (target_latent_frames + keyframe_latent_frames);
+    sd::Tensor<float> positions({2, 3, total_tokens, 1});
+    int64_t token = 0;
+
+    for (int64_t t = 0; t < target_latent_frames; t++) {
+        float t_start = ltxv_latent_corner_to_pixel_frame(t, temporal_scale, causal_temporal_positioning) / static_cast<float>(fps);
+        float t_end   = ltxv_latent_corner_to_pixel_frame(t + 1, temporal_scale, causal_temporal_positioning) / static_cast<float>(fps);
+        for (int64_t h = 0; h < height; h++) {
+            float h_start = static_cast<float>(h * spatial_scale);
+            float h_end   = static_cast<float>((h + 1) * spatial_scale);
+            for (int64_t w = 0; w < width; w++) {
+                float w_start = static_cast<float>(w * spatial_scale);
+                float w_end   = static_cast<float>((w + 1) * spatial_scale);
+                set_ltxv_video_position(&positions, token++, t_start, t_end, h_start, h_end, w_start, w_end);
+            }
+        }
+    }
+
+    for (int64_t t = 0; t < keyframe_latent_frames; t++) {
+        float t_start = static_cast<float>(keyframe_frame_idx + t * temporal_scale);
+        float t_end   = static_cast<float>(keyframe_frame_idx + (t + 1) * temporal_scale);
+        if (keyframe_pixel_frames == 1) {
+            t_end = t_start + 1.f;
+        }
+        t_start /= static_cast<float>(fps);
+        t_end /= static_cast<float>(fps);
+        for (int64_t h = 0; h < height; h++) {
+            float h_start = static_cast<float>(h * spatial_scale);
+            float h_end   = static_cast<float>((h + 1) * spatial_scale);
+            for (int64_t w = 0; w < width; w++) {
+                float w_start = static_cast<float>(w * spatial_scale);
+                float w_end   = static_cast<float>((w + 1) * spatial_scale);
+                set_ltxv_video_position(&positions, token++, t_start, t_end, h_start, h_end, w_start, w_end);
+            }
+        }
+    }
+
+    return positions;
+}
+
 static sd::Tensor<float> pack_ltxav_audio_and_video_latents(const sd::Tensor<float>& video_latent,
                                                            const sd::Tensor<float>& audio_latent) {
    if (audio_latent.empty()) {
@ -4151,33 +4236,27 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
    }

    if (sd_version_is_ltxav(sd_ctx->sd->version)) {
-        if (!end_image.empty() || sd_vid_gen_params->control_frames_size > 0) {
-            LOG_ERROR("LTXAV currently supports txt2vid and init_image i2v only; end_image and control_frames are not implemented");
+        if (sd_vid_gen_params->control_frames_size > 0) {
+            LOG_ERROR("LTXAV control_frames are not implemented");
            return std::nullopt;
        }

-        if (!start_image.empty()) {
+        if (!start_image.empty() || !end_image.empty()) {
            if (sd_ctx->sd->vae_decode_only) {
-                LOG_ERROR("LTXAV init_image i2v requires VAE encoder weights; create the context with vae_decode_only=false");
+                LOG_ERROR("LTXAV image conditioning requires VAE encoder weights; create the context with vae_decode_only=false");
                return std::nullopt;
            }

-            LOG_INFO("IMG2VID");
-
-            int64_t t1             = ggml_time_ms();
-            auto init_img          = start_image.reshape({start_image.shape()[0],
-                                                          start_image.shape()[1],
-                                                          1,
-                                                          start_image.shape()[2],
-                                                          start_image.shape()[3]});
-            auto init_image_latent = sd_ctx->sd->encode_first_stage(init_img);
-            if (init_image_latent.empty()) {
-                LOG_ERROR("failed to encode LTXAV init image");
-                return std::nullopt;
+            if (!start_image.empty() && !end_image.empty()) {
+                LOG_INFO("FLF2V");
+            } else if (!start_image.empty()) {
+                LOG_INFO("IMG2VID");
+            } else {
+                LOG_INFO("END2VID");
            }

+            int64_t t1          = ggml_time_ms();
            latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
-            sd::ops::slice_assign(&latents.init_latent, 2, 0, init_image_latent.shape()[2], init_image_latent);

            float conditioning_strength = std::clamp(request->strength, 0.f, 1.f);
            float conditioned_mask      = 1.0f - conditioning_strength;
@ -4187,7 +4266,94 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
                                                           1,
                                                           1},
                                                   1.f);
-            sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], conditioned_mask);
+
+            auto encode_ltxav_condition_image = [&](const sd::Tensor<float>& image, const char* name) -> sd::Tensor<float> {
+                auto condition_image  = image.reshape({image.shape()[0],
+                                                       image.shape()[1],
+                                                       1,
+                                                       image.shape()[2],
+                                                       image.shape()[3]});
+                auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image);
+                if (condition_latent.empty()) {
+                    LOG_ERROR("failed to encode LTXAV %s image", name);
+                }
+                return condition_latent;
+            };
+
+            auto apply_video_condition_by_latent_index = [&](const sd::Tensor<float>& condition_latent,
+                                                             int64_t latent_idx,
+                                                             const char* name) -> bool {
+                int64_t latent_frames    = latents.init_latent.shape()[2];
+                int64_t condition_frames = condition_latent.shape()[2];
+                if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) {
+                    LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
+                              name,
+                              latent_idx,
+                              condition_frames,
+                              latent_frames);
+                    return false;
+                }
+
+                sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent);
+                sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask);
+                return true;
+            };
+
+            auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor<float>& keyframes,
+                                                               int frame_idx,
+                                                               const char* name) -> bool {
+                int64_t keyframe_frames = keyframes.shape()[2];
+                if (keyframe_frames <= 0 || keyframes.shape()[0] != latents.init_latent.shape()[0] ||
+                    keyframes.shape()[1] != latents.init_latent.shape()[1] ||
+                    keyframes.shape()[3] != latents.init_latent.shape()[3]) {
+                    LOG_ERROR("invalid LTXAV %s keyframe latent shape", name);
+                    return false;
+                }
+
+                latents.video_target_frame_count       = latents.init_latent.shape()[2];
+                latents.video_conditioning_frame_count = keyframe_frames;
+                latents.init_latent                    = sd::ops::concat(latents.init_latent, keyframes, 2);
+
+                auto keyframe_mask      = sd::full<float>({keyframes.shape()[0],
+                                                           keyframes.shape()[1],
+                                                           keyframes.shape()[2],
+                                                           1,
+                                                           1},
+                                                     conditioned_mask);
+                latents.denoise_mask    = sd::ops::concat(latents.denoise_mask, keyframe_mask, 2);
+                latents.video_positions = build_ltxv_video_positions(latents.init_latent.shape()[0],
+                                                                     latents.init_latent.shape()[1],
+                                                                     latents.video_target_frame_count,
+                                                                     keyframe_frames,
+                                                                     frame_idx,
+                                                                     1,
+                                                                     request->fps,
+                                                                     request->vae_scale_factor,
+                                                                     8,
+                                                                     true);
+                return true;
+            };
+
+            if (!start_image.empty()) {
+                auto start_image_latent = encode_ltxav_condition_image(start_image, "init");
+                if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) {
+                    return std::nullopt;
+                }
+            }
+
+            if (!end_image.empty()) {
+                auto end_image_latent = encode_ltxav_condition_image(end_image, "end");
+                if (end_image_latent.empty()) {
+                    return std::nullopt;
+                }
+
+                int frame_idx = request->frames - 1;
+                bool ok       = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end")
+                                               : apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end");
+                if (!ok) {
+                    return std::nullopt;
+                }
+            }

            int64_t t2 = ggml_time_ms();
            LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
@ -4543,7 +4709,8 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
                                                           request.vace_strength,
                                                           latents.audio_length,
                                                           static_cast<float>(request.fps),
-                                                           request.cache_params);
+                                                           request.cache_params,
+                                                           latents.video_positions);
        int64_t sampling_end          = ggml_time_ms();
        if (x_t_sampled.empty()) {
            LOG_ERROR("sampling(high noise) failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
@ -4588,7 +4755,8 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
                                                        request.vace_strength,
                                                        latents.audio_length,
                                                        static_cast<float>(request.fps),
-                                                        request.cache_params);
+                                                        request.cache_params,
+                                                        latents.video_positions);

    int64_t sampling_end = ggml_time_ms();
    if (sd_ctx->sd->free_params_immediately) {
@ -4617,6 +4785,12 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
        }
    }

+    if (latents.video_conditioning_frame_count > 0) {
+        int64_t target_frames = latents.video_target_frame_count > 0 ? latents.video_target_frame_count
+                                                                     : final_latent.shape()[2] - latents.video_conditioning_frame_count;
+        final_latent          = sd::ops::slice(final_latent, 2, 0, target_frames);
+    }
+
    if (latents.ref_image_num > 0) {
        final_latent = sd::ops::slice(final_latent, 2, latents.ref_image_num, final_latent.shape()[2]);
    }