feat: add ltx2.3 flf2v support (#1505)

This commit is contained in:
leejet 2026-05-17 18:40:14 +08:00 committed by GitHub
parent 06accf2b39
commit e43b24cf48
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 338 additions and 52 deletions

BIN
assets/ltx2/flf2v.webm Normal file

Binary file not shown.

View File

@ -38,4 +38,16 @@
src="../assets/ltx2/i2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>
### LTX-2.3 dev FLF2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png -o flf2v.webm
```
<video
src="../assets/ltx2/flf2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>

View File

@ -40,6 +40,7 @@ struct DiffusionParams {
float vace_strength = 1.f;
int audio_length = 0;
float frame_rate = 24.f;
const sd::Tensor<float>* video_positions = nullptr;
const std::vector<int>* skip_layers = nullptr;
};
@ -766,7 +767,8 @@ struct LTXAVModel : public DiffusionModel {
tensor_or_empty(diffusion_params.audio_x),
tensor_or_empty(diffusion_params.audio_timesteps),
diffusion_params.audio_length,
diffusion_params.frame_rate);
diffusion_params.frame_rate,
tensor_or_empty(diffusion_params.video_positions));
}
};

View File

@ -243,6 +243,56 @@ namespace LTXV {
return build_rope_matrix_from_frequencies(freqs, dim);
}
// Builds the video RoPE matrix from explicit per-token position bounds instead
// of a regular (t, h, w) grid.
//
// `positions` must have shape {2, 3, tokens} or {2, 3, tokens, 1}: index 0 of
// the first dimension holds the start bound and index 1 the end bound, the
// second dimension selects one of three axes, the third selects the token.
// Each axis coordinate is either the midpoint of its bounds
// (`use_middle_indices_grid`) or the start bound, divided by `max_pos[axis]`
// and remapped with `c * 2 - 1` before being multiplied by every entry of the
// frequency grid. Rows are left-padded with zeros up to `dim / 2` entries.
// When `num_heads > 1` the frequencies are split per head before the final
// matrix is built.
__STATIC_INLINE__ std::vector<float> build_video_rope_matrix_from_positions(const sd::Tensor<float>& positions,
                                                                            int dim,
                                                                            int num_heads,
                                                                            float theta,
                                                                            const std::vector<int>& max_pos,
                                                                            bool use_middle_indices_grid) {
    GGML_ASSERT(max_pos.size() == 3);
    GGML_ASSERT(dim % num_heads == 0);
    GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4);
    GGML_ASSERT(positions.shape()[0] == 2);
    GGML_ASSERT(positions.shape()[1] == 3);
    const bool has_trailing_axis = positions.dim() == 4;
    if (has_trailing_axis) {
        GGML_ASSERT(positions.shape()[3] == 1);
    }

    // Reads one bound (0 = start, 1 = end) regardless of the tensor rank.
    auto read_bound = [&](int bound, int axis, int64_t token_idx) -> float {
        if (has_trailing_axis) {
            return positions.index(bound, axis, token_idx, 0);
        }
        return positions.index(bound, axis, token_idx);
    };

    const int64_t token_count         = positions.shape()[2];
    const std::vector<float> freq_grid = generate_freq_grid(theta, 3, dim);
    const int half_dim                = dim / 2;
    const int leading_pad             = half_dim - static_cast<int>(freq_grid.size()) * 3;

    // Rows start zero-filled, so the leading padding needs no explicit writes.
    std::vector<std::vector<float>> freqs(static_cast<size_t>(token_count), std::vector<float>(half_dim, 0.f));
    for (int64_t token_idx = 0; token_idx < token_count; token_idx++) {
        float normalized[3];
        for (int axis = 0; axis < 3; axis++) {
            const float lower = read_bound(0, axis, token_idx);
            const float upper = read_bound(1, axis, token_idx);
            float picked;
            if (use_middle_indices_grid) {
                picked = 0.5f * (lower + upper);
            } else {
                picked = lower;
            }
            normalized[axis] = picked / static_cast<float>(max_pos[axis]);
        }

        std::vector<float>& row = freqs[token_idx];
        int cursor              = leading_pad > 0 ? leading_pad : 0;
        for (float freq : freq_grid) {
            for (int axis = 0; axis < 3; axis++) {
                row[cursor++] = freq * (normalized[axis] * 2.f - 1.f);
            }
        }
    }

    if (num_heads <= 1) {
        return build_rope_matrix_from_frequencies(freqs, dim);
    }
    return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
}
__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix(int64_t seq_len,
int dim,
int num_heads = 1,
@ -848,6 +898,31 @@ namespace LTXV {
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
}
// Builds a 1-D (temporal-only) RoPE matrix from explicit per-token position
// bounds.
//
// `positions` must have shape {2, A, tokens} or {2, A, tokens, 1} with A >= 1.
// Only axis 0 is read: index 0 of the first dimension is the start bound and
// index 1 the end bound. Each token's coordinate is the midpoint of the two
// bounds when `use_middle_indices_grid` is set, otherwise the start bound.
// The coordinates are handed to build_1d_rope_matrix_from_coords together with
// `max_pos_t` converted to float.
__STATIC_INLINE__ std::vector<float> build_video_temporal_rope_matrix_from_positions(const sd::Tensor<float>& positions,
                                                                                     int dim,
                                                                                     int num_heads,
                                                                                     float theta,
                                                                                     int max_pos_t,
                                                                                     bool use_middle_indices_grid) {
    GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4);
    GGML_ASSERT(positions.shape()[0] == 2);
    GGML_ASSERT(positions.shape()[1] >= 1);
    const bool has_trailing_axis = positions.dim() == 4;
    if (has_trailing_axis) {
        GGML_ASSERT(positions.shape()[3] == 1);
    }

    // Reads the axis-0 bound (0 = start, 1 = end) regardless of the tensor rank.
    auto read_time_bound = [&](int bound, int64_t token_idx) -> float {
        if (has_trailing_axis) {
            return positions.index(bound, 0, token_idx, 0);
        }
        return positions.index(bound, 0, token_idx);
    };

    const int64_t token_count = positions.shape()[2];
    std::vector<float> coords;
    coords.reserve(static_cast<size_t>(token_count));
    for (int64_t token_idx = 0; token_idx < token_count; token_idx++) {
        const float lower = read_time_bound(0, token_idx);
        const float upper = read_time_bound(1, token_idx);
        if (use_middle_indices_grid) {
            coords.push_back(0.5f * (lower + upper));
        } else {
            coords.push_back(lower);
        }
    }

    const float max_time = static_cast<float>(max_pos_t);
    return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, max_time);
}
__STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
int audio_latent_downsample_factor = 4,
int hop_length = 160,
@ -1664,7 +1739,8 @@ namespace LTXV {
const sd::Tensor<float>& audio_x_tensor = {},
const sd::Tensor<float>& audio_timesteps_tensor = {},
int audio_length = 0,
float frame_rate = 24.f) {
float frame_rate = 24.f,
const sd::Tensor<float>& video_positions_tensor = {}) {
auto split_inputs = split_av_latents(x_tensor, audio_length);
vx_input_cache = split_inputs.first;
if (!audio_x_tensor.empty()) {
@ -1681,19 +1757,31 @@ namespace LTXV {
ggml_cgraph* gf = new_graph_custom(LTXAV_GRAPH_SIZE);
float video_frame_rate = frame_rate > 0.f ? frame_rate : 24.f;
video_pe_vec = build_video_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.vae_scale_factors,
params.causal_temporal_positioning,
params.use_middle_indices_grid);
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads);
float video_frame_rate = frame_rate > 0.f ? frame_rate : 24.f;
int64_t video_token_count = vx->ne[0] * vx->ne[1] * vx->ne[2];
bool has_video_positions = !video_positions_tensor.empty();
if (has_video_positions) {
GGML_ASSERT(video_positions_tensor.shape()[2] == video_token_count);
video_pe_vec = build_video_rope_matrix_from_positions(video_positions_tensor,
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.use_middle_indices_grid);
} else {
video_pe_vec = build_video_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.vae_scale_factors,
params.causal_temporal_positioning,
params.use_middle_indices_grid);
}
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, video_token_count * params.num_attention_heads);
ggml_set_name(video_pe, "ltxav_video_pe");
set_backend_tensor_data(video_pe, video_pe_vec.data());
@ -1712,18 +1800,27 @@ namespace LTXV {
set_backend_tensor_data(audio_pe, audio_pe_vec.data());
int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
temporal_max_pos,
std::get<0>(params.vae_scale_factors),
params.causal_temporal_positioning,
true);
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads);
if (has_video_positions) {
video_cross_pe_vec = build_video_temporal_rope_matrix_from_positions(video_positions_tensor,
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
params.positional_embedding_theta,
temporal_max_pos,
true);
} else {
video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
temporal_max_pos,
std::get<0>(params.vae_scale_factors),
params.causal_temporal_positioning,
true);
}
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, video_token_count * params.audio_num_attention_heads);
ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
@ -1806,9 +1903,10 @@ namespace LTXV {
const sd::Tensor<float>& audio_x = {},
const sd::Tensor<float>& audio_timesteps = {},
int audio_length = 0,
float frame_rate = 24.f) {
float frame_rate = 24.f,
const sd::Tensor<float>& video_positions = {}) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate);
return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate, video_positions);
};
auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
return out;

View File

@ -1842,7 +1842,8 @@ public:
float vace_strength,
int audio_length,
float frame_rate,
const sd_cache_params_t* cache_params) {
const sd_cache_params_t* cache_params,
const sd::Tensor<float>& video_positions = {}) {
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
float cfg_scale = guidance.txt_cfg;
float img_cfg_scale = guidance.img_cfg;
@ -1948,6 +1949,7 @@ public:
diffusion_params.vace_strength = vace_strength;
diffusion_params.audio_length = audio_length;
diffusion_params.frame_rate = frame_rate;
diffusion_params.video_positions = video_positions.empty() ? nullptr : &video_positions;
diffusion_params.skip_layers = nullptr;
compute_sample_controls(control_image,
@ -3231,16 +3233,99 @@ struct ImageGenerationLatents {
sd::Tensor<float> concat_latent;
sd::Tensor<float> uncond_concat_latent;
sd::Tensor<float> audio_latent;
sd::Tensor<float> video_positions;
sd::Tensor<float> control_image;
std::vector<sd::Tensor<float>> ref_images;
std::vector<sd::Tensor<float>> ref_latents;
sd::Tensor<float> denoise_mask;
sd::Tensor<float> clip_vision_output;
sd::Tensor<float> vace_context;
int64_t ref_image_num = 0;
int audio_length = 0;
int64_t ref_image_num = 0;
int64_t video_conditioning_frame_count = 0;
int64_t video_target_frame_count = 0;
int audio_length = 0;
};
// Converts a latent-frame corner index to a pixel-frame index.
//
// The plain mapping is `corner_index * temporal_scale`. With
// `causal_temporal_positioning` the result is shifted by
// `1 - temporal_scale` and clamped so it never drops below zero.
static float ltxv_latent_corner_to_pixel_frame(int64_t corner_index,
                                               int temporal_scale,
                                               bool causal_temporal_positioning) {
    const float unshifted = static_cast<float>(corner_index * temporal_scale);
    if (!causal_temporal_positioning) {
        return unshifted;
    }
    const float shifted = unshifted + 1.f - static_cast<float>(temporal_scale);
    return std::max(0.f, shifted);
}
// Writes the start/end bounds of one token into a rank-4 positions tensor.
//
// Layout written: positions(bound, axis, token, 0), where bound 0 is the start
// and bound 1 the end, and axes 0/1/2 receive the t/h/w bounds respectively.
static void set_ltxv_video_position(sd::Tensor<float>* positions,
                                    int64_t token,
                                    float t_start,
                                    float t_end,
                                    float h_start,
                                    float h_end,
                                    float w_start,
                                    float w_end) {
    const float bounds[3][2] = {
        {t_start, t_end},
        {h_start, h_end},
        {w_start, w_end},
    };
    for (int axis = 0; axis < 3; axis++) {
        positions->index(0, axis, token, 0) = bounds[axis][0];
        positions->index(1, axis, token, 0) = bounds[axis][1];
    }
}
// Builds the {2, 3, tokens, 1} position-bounds tensor for a latent video that
// consists of `target_latent_frames` regular frames followed by
// `keyframe_latent_frames` appended keyframe frames.
//
// Tokens are emitted frame by frame, then row by row, then column by column.
// Spatial bounds are `[i * spatial_scale, (i + 1) * spatial_scale)` for both
// sections. Temporal bounds are divided by `fps`:
//   - target frames use ltxv_latent_corner_to_pixel_frame on corners t and t + 1;
//   - keyframe frames start at `keyframe_frame_idx + t * temporal_scale` and
//     span `temporal_scale` pixel frames, or exactly one pixel frame when
//     `keyframe_pixel_frames == 1`.
// Asserts that width, height, both frame counts and fps are positive.
static sd::Tensor<float> build_ltxv_video_positions(int64_t width,
                                                    int64_t height,
                                                    int64_t target_latent_frames,
                                                    int64_t keyframe_latent_frames,
                                                    int keyframe_frame_idx,
                                                    int keyframe_pixel_frames,
                                                    int fps,
                                                    int spatial_scale,
                                                    int temporal_scale,
                                                    bool causal_temporal_positioning) {
    GGML_ASSERT(width > 0 && height > 0 && target_latent_frames > 0);
    GGML_ASSERT(keyframe_latent_frames > 0);
    GGML_ASSERT(fps > 0);

    const int64_t total_tokens = width * height * (target_latent_frames + keyframe_latent_frames);
    sd::Tensor<float> positions({2, 3, total_tokens, 1});

    const float fps_f  = static_cast<float>(fps);
    int64_t next_token = 0;

    // Emits every spatial token of one latent frame with the given time bounds.
    auto emit_frame_tokens = [&](float t_start, float t_end) {
        for (int64_t row = 0; row < height; row++) {
            const float h_start = static_cast<float>(row * spatial_scale);
            const float h_end   = static_cast<float>((row + 1) * spatial_scale);
            for (int64_t col = 0; col < width; col++) {
                const float w_start = static_cast<float>(col * spatial_scale);
                const float w_end   = static_cast<float>((col + 1) * spatial_scale);
                set_ltxv_video_position(&positions, next_token++, t_start, t_end, h_start, h_end, w_start, w_end);
            }
        }
    };

    // Regular target frames.
    for (int64_t frame = 0; frame < target_latent_frames; frame++) {
        const float t_start = ltxv_latent_corner_to_pixel_frame(frame, temporal_scale, causal_temporal_positioning) / fps_f;
        const float t_end   = ltxv_latent_corner_to_pixel_frame(frame + 1, temporal_scale, causal_temporal_positioning) / fps_f;
        emit_frame_tokens(t_start, t_end);
    }

    // Appended keyframe frames, positioned at their pixel-frame index.
    const bool single_pixel_keyframe = keyframe_pixel_frames == 1;
    for (int64_t frame = 0; frame < keyframe_latent_frames; frame++) {
        const float start_frame = static_cast<float>(keyframe_frame_idx + frame * temporal_scale);
        const float end_frame   = single_pixel_keyframe
                                      ? start_frame + 1.f
                                      : static_cast<float>(keyframe_frame_idx + (frame + 1) * temporal_scale);
        emit_frame_tokens(start_frame / fps_f, end_frame / fps_f);
    }

    return positions;
}
static sd::Tensor<float> pack_ltxav_audio_and_video_latents(const sd::Tensor<float>& video_latent,
const sd::Tensor<float>& audio_latent) {
if (audio_latent.empty()) {
@ -4151,33 +4236,27 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
}
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
if (!end_image.empty() || sd_vid_gen_params->control_frames_size > 0) {
LOG_ERROR("LTXAV currently supports txt2vid and init_image i2v only; end_image and control_frames are not implemented");
if (sd_vid_gen_params->control_frames_size > 0) {
LOG_ERROR("LTXAV control_frames are not implemented");
return std::nullopt;
}
if (!start_image.empty()) {
if (!start_image.empty() || !end_image.empty()) {
if (sd_ctx->sd->vae_decode_only) {
LOG_ERROR("LTXAV init_image i2v requires VAE encoder weights; create the context with vae_decode_only=false");
LOG_ERROR("LTXAV image conditioning requires VAE encoder weights; create the context with vae_decode_only=false");
return std::nullopt;
}
LOG_INFO("IMG2VID");
int64_t t1 = ggml_time_ms();
auto init_img = start_image.reshape({start_image.shape()[0],
start_image.shape()[1],
1,
start_image.shape()[2],
start_image.shape()[3]});
auto init_image_latent = sd_ctx->sd->encode_first_stage(init_img);
if (init_image_latent.empty()) {
LOG_ERROR("failed to encode LTXAV init image");
return std::nullopt;
if (!start_image.empty() && !end_image.empty()) {
LOG_INFO("FLF2V");
} else if (!start_image.empty()) {
LOG_INFO("IMG2VID");
} else {
LOG_INFO("END2VID");
}
int64_t t1 = ggml_time_ms();
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
sd::ops::slice_assign(&latents.init_latent, 2, 0, init_image_latent.shape()[2], init_image_latent);
float conditioning_strength = std::clamp(request->strength, 0.f, 1.f);
float conditioned_mask = 1.0f - conditioning_strength;
@ -4187,7 +4266,94 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
1,
1},
1.f);
sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], conditioned_mask);
// Encodes one conditioning image into a latent with the first-stage encoder.
// `name` is only used in the error message. Returns the encoder output as-is;
// an empty tensor signals failure (an error is logged), so callers must check
// `.empty()` on the result.
auto encode_ltxav_condition_image = [&](const sd::Tensor<float>& image, const char* name) -> sd::Tensor<float> {
// Insert a singleton dimension at index 2 so the 4-D image becomes 5-D
// (presumably the frame axis of a one-frame video — confirm against encode_first_stage).
auto condition_image = image.reshape({image.shape()[0],
image.shape()[1],
1,
image.shape()[2],
image.shape()[3]});
auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image);
if (condition_latent.empty()) {
LOG_ERROR("failed to encode LTXAV %s image", name);
}
return condition_latent;
};
// Writes a conditioning latent into latents.init_latent in place, starting at
// `latent_idx` along dimension 2, and sets the same range of
// latents.denoise_mask to `conditioned_mask`.
// Returns false (after logging) when the range is negative, empty, or extends
// past the number of frames in latents.init_latent; nothing is modified then.
// `name` is only used in the error message.
auto apply_video_condition_by_latent_index = [&](const sd::Tensor<float>& condition_latent,
int64_t latent_idx,
const char* name) -> bool {
int64_t latent_frames = latents.init_latent.shape()[2];
int64_t condition_frames = condition_latent.shape()[2];
// Reject ranges that do not fit entirely inside the existing latent frames.
if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) {
LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
name,
latent_idx,
condition_frames,
latent_frames);
return false;
}
// Both helpers are called with (dim = 2, start, end); presumably the range is half-open — confirm.
sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent);
sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask);
return true;
};
// Appends keyframe latents after the target frames instead of overwriting
// them: `keyframes` is concatenated to latents.init_latent along dimension 2,
// a matching mask filled with `conditioned_mask` is concatenated to
// latents.denoise_mask, and latents.video_positions is rebuilt so the appended
// tokens carry the temporal position of pixel frame `frame_idx`.
// Also records the pre-append frame count (video_target_frame_count) and the
// number of appended frames (video_conditioning_frame_count).
// Returns false (after logging) when `keyframes` has no frames or its
// dimensions 0, 1 or 3 differ from latents.init_latent.
// NOTE(review): the target frame count is taken from the current init_latent,
// so a second call would count earlier keyframes as target frames — the
// visible code calls this at most once (end image only); confirm if extended.
auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor<float>& keyframes,
int frame_idx,
const char* name) -> bool {
int64_t keyframe_frames = keyframes.shape()[2];
if (keyframe_frames <= 0 || keyframes.shape()[0] != latents.init_latent.shape()[0] ||
keyframes.shape()[1] != latents.init_latent.shape()[1] ||
keyframes.shape()[3] != latents.init_latent.shape()[3]) {
LOG_ERROR("invalid LTXAV %s keyframe latent shape", name);
return false;
}
// Capture the frame count before the concat below grows init_latent.
latents.video_target_frame_count = latents.init_latent.shape()[2];
latents.video_conditioning_frame_count = keyframe_frames;
latents.init_latent = sd::ops::concat(latents.init_latent, keyframes, 2);
// Mask for the appended frames: same first three extents as the keyframes, singleton elsewhere.
auto keyframe_mask = sd::full<float>({keyframes.shape()[0],
keyframes.shape()[1],
keyframes.shape()[2],
1,
1},
conditioned_mask);
latents.denoise_mask = sd::ops::concat(latents.denoise_mask, keyframe_mask, 2);
// Arguments after frame_idx: keyframe_pixel_frames = 1, fps, spatial_scale,
// temporal_scale = 8, causal_temporal_positioning = true.
// NOTE(review): 8 and true are hard-coded here; presumably they match the
// LTX video VAE temporal factor and model config — confirm against params.
// NOTE(review): build_ltxv_video_positions asserts fps > 0, so a non-positive
// request->fps aborts here — verify it is validated upstream.
latents.video_positions = build_ltxv_video_positions(latents.init_latent.shape()[0],
latents.init_latent.shape()[1],
latents.video_target_frame_count,
keyframe_frames,
frame_idx,
1,
request->fps,
request->vae_scale_factor,
8,
true);
return true;
};
if (!start_image.empty()) {
auto start_image_latent = encode_ltxav_condition_image(start_image, "init");
if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) {
return std::nullopt;
}
}
if (!end_image.empty()) {
auto end_image_latent = encode_ltxav_condition_image(end_image, "end");
if (end_image_latent.empty()) {
return std::nullopt;
}
int frame_idx = request->frames - 1;
bool ok = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end")
: apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end");
if (!ok) {
return std::nullopt;
}
}
int64_t t2 = ggml_time_ms();
LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
@ -4543,7 +4709,8 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
request.vace_strength,
latents.audio_length,
static_cast<float>(request.fps),
request.cache_params);
request.cache_params,
latents.video_positions);
int64_t sampling_end = ggml_time_ms();
if (x_t_sampled.empty()) {
LOG_ERROR("sampling(high noise) failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
@ -4588,7 +4755,8 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
request.vace_strength,
latents.audio_length,
static_cast<float>(request.fps),
request.cache_params);
request.cache_params,
latents.video_positions);
int64_t sampling_end = ggml_time_ms();
if (sd_ctx->sd->free_params_immediately) {
@ -4617,6 +4785,12 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
}
}
if (latents.video_conditioning_frame_count > 0) {
int64_t target_frames = latents.video_target_frame_count > 0 ? latents.video_target_frame_count
: final_latent.shape()[2] - latents.video_conditioning_frame_count;
final_latent = sd::ops::slice(final_latent, 2, 0, target_frames);
}
if (latents.ref_image_num > 0) {
final_latent = sd::ops::slice(final_latent, 2, latents.ref_image_num, final_latent.shape()[2]);
}