From e744e1e4e2c8897427a4cf61808e7bb80b8ed9b5 Mon Sep 17 00:00:00 2001
From: leejet
Date: Wed, 29 Apr 2026 22:22:34 +0800
Subject: [PATCH] add temporal tiling support

---
 examples/cli/README.md     |   1 +
 examples/common/common.cpp |   9 ++
 examples/common/common.h   |   2 +-
 include/stable-diffusion.h |   1 +
 src/ggml_extend.hpp        |  30 +++++
 src/ltx_vae.hpp            | 266 ++++++++++++++++++++++++++++++++++---
 src/ltxv.hpp               |  12 +-
 src/stable-diffusion.cpp   |  12 +-
 src/vae.hpp                |   1 +
 9 files changed, 304 insertions(+), 30 deletions(-)

diff --git a/examples/cli/README.md b/examples/cli/README.md
index 7b620fee..1459206a 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -156,6 +156,7 @@ Generation Options:
   --disable-auto-resize-ref-image    disable auto resize of ref images
   --disable-image-metadata           do not embed generation metadata on image files
   --vae-tiling                       process vae in tiles to reduce memory usage
+  --temporal-tiling                  enable temporal tiling for LTX video VAE decode
   --hires                            enable highres fix
   -s, --seed                         RNG seed (default: 42, use random seed for < 0)
   --sampling-method                  sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index 80a37246..a251b9ce 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -989,6 +989,11 @@ ArgOptions SDGenerationParams::get_options() {
          "process vae in tiles to reduce memory usage",
          true,
          &vae_tiling_params.enabled},
+        {"",
+         "--temporal-tiling",
+         "enable temporal tiling for LTX video VAE decode",
+         true,
+         &vae_tiling_params.temporal_tiling},
         {"",
          "--hires",
          "enable highres fix",
@@ -1681,6 +1686,9 @@ bool SDGenerationParams::from_json_str(
         if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) {
             vae_tiling_params.enabled = tiling_json["enabled"];
         }
+        if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) {
+            vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"];
+        }
         if (tiling_json.contains("tile_size_x") && tiling_json["tile_size_x"].is_number_integer()) {
             vae_tiling_params.tile_size_x = tiling_json["tile_size_x"];
         }
@@ -2275,6 +2283,7 @@ std::string SDGenerationParams::to_string() const {
            << ", upscale_tile_size: " << hires_upscale_tile_size << " },\n"
            << " vae_tiling_params: { "
            << vae_tiling_params.enabled << ", "
+           << vae_tiling_params.temporal_tiling << ", "
            << vae_tiling_params.tile_size_x << ", "
            << vae_tiling_params.tile_size_y << ", "
            << vae_tiling_params.target_overlap << ", "
diff --git a/examples/common/common.h b/examples/common/common.h
index bdd8246d..b7d867d7 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -183,7 +183,7 @@ struct SDGenerationParams {
     int video_frames = 1;
     int fps = 16;
     float vace_strength = 1.f;
-    sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};

     std::string pm_id_images_dir;
     std::string pm_id_embed_path;
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 14c26a3f..2fd46dbb 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -149,6 +149,7 @@ enum lora_apply_mode_t {

 typedef struct {
     bool enabled;
+    bool temporal_tiling;
     int tile_size_x;
     int tile_size_y;
     float target_overlap;
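
Note on the struct change above: temporal_tiling lands as the second field of sd_tiling_params_t, which is why every positional brace-initializer in this patch grows a second `false`. A caller-side sketch of the less fragile field-wise style (the values are illustrative, not defaults taken from this patch):

    #include <stdbool.h>
    #include "stable-diffusion.h"

    /* Sketch: set tiling params field by field, so that inserting a new
     * member (like temporal_tiling) cannot silently shift later values
     * the way it can with positional initializers. */
    sd_tiling_params_t make_tiling_params(void) {
        sd_tiling_params_t p = {0};
        p.enabled         = true;  /* spatial VAE tiling */
        p.temporal_tiling = true;  /* new: per-frame LTX video decode */
        p.target_overlap  = 0.5f;  /* same value the initializers above use */
        return p;
    }
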
diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 1053d24d..e0194905 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -1683,6 +1683,8 @@ struct GGMLRunnerContext {
     bool circular_y_enabled = false;
     std::shared_ptr weight_adapter = nullptr;
    std::unordered_map<ggml_tensor*, std::string>* debug_tensors = nullptr;
+    std::function<ggml_tensor*(const std::string&)> get_cache_tensor;
+    std::function<void(const std::string&, ggml_tensor*)> cache_tensor;

     void capture_tensor(const std::string& name, ggml_tensor* tensor) {
         if (debug_tensors == nullptr || tensor == nullptr) {
@@ -1691,6 +1693,20 @@ struct GGMLRunnerContext {
         ggml_set_output(tensor);
         (*debug_tensors)[tensor] = name;
     }
+
+    ggml_tensor* load_cache_tensor(const std::string& name) const {
+        if (!get_cache_tensor) {
+            return nullptr;
+        }
+        return get_cache_tensor(name);
+    }
+
+    void persist_cache_tensor(const std::string& name, ggml_tensor* tensor) const {
+        if (!cache_tensor || tensor == nullptr) {
+            return;
+        }
+        cache_tensor(name, tensor);
+    }
 };

 struct GGMLRunner {
@@ -1850,6 +1866,11 @@ protected:
                 ggml_build_forward_expand(gf, entry.first);
             }
         }
+        for (const auto& entry : cache_tensor_map) {
+            if (entry.second != nullptr) {
+                ggml_build_forward_expand(gf, entry.second);
+            }
+        }
         prepare_build_in_tensor_after(gf);
         return gf;
     }
@@ -2057,6 +2078,12 @@ public:
         runner_ctx.circular_y_enabled = circular_y_enabled;
         runner_ctx.weight_adapter = weight_adapter;
         runner_ctx.debug_tensors = &debug_tensors;
+        runner_ctx.get_cache_tensor = [this](const std::string& name) {
+            return this->get_cache_tensor_by_name(name);
+        };
+        runner_ctx.cache_tensor = [this](const std::string& name, ggml_tensor* tensor) {
+            this->cache(name, tensor);
+        };
         return runner_ctx;
     }
@@ -2156,6 +2183,9 @@ public:
     }

     void cache(const std::string name, ggml_tensor* tensor) {
+        if (tensor != nullptr && tensor->view_src != nullptr) {
+            tensor = ggml_cont(compute_ctx, tensor);
+        }
         cache_tensor_map[name] = tensor;
     }
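
How the new cache plumbing fits together: while one graph is being built, a block calls persist_cache_tensor(name, t); GGMLRunner::cache() snapshots t (forcing a contiguous copy when it is a view, per the hunk above), the graph-build loop expands every cached tensor into the forward graph so it actually gets computed, and on the next build the block reads it back through load_cache_tensor(name). A minimal sketch of a block using this, with a hypothetical block class and cache key ("my_state"); it assumes the runner keeps cached tensors alive between computations:

    // Sketch only: carry the last frame of x across successive runner computations.
    ggml_tensor* MyStatefulBlock::forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        ggml_tensor* prev = ctx->load_cache_tensor("my_state");  // nullptr on the first run
        if (prev != nullptr) {
            x = ggml_concat(ctx->ggml_ctx, prev, x, 2);  // prepend cached context frames
        }
        // Persist the tail; GGMLRunner::cache() turns this view into a
        // contiguous copy, so the full x does not stay alive along with it.
        auto tail = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
        ctx->persist_cache_tensor("my_state", tail);
        return x;
    }
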
diff --git a/src/ltx_vae.hpp b/src/ltx_vae.hpp
index baefe4e1..8bcc1ca8 100644
--- a/src/ltx_vae.hpp
+++ b/src/ltx_vae.hpp
@@ -130,6 +130,56 @@ namespace LTXVAE {
         }
         return conv->forward(ctx, x);
     }
+
+    // Chunked forward: uses feat_map to carry temporal context across frames.
+    // feat_map[feat_idx] holds the last `pad` frames from the previous chunk at
+    // this layer. nullptr means first chunk → fall back to repeat-first-frame.
+    // The cache entry is a contiguous copy (not a view) so that the large
+    // intermediate tensor `x` can be freed by GGML after this iteration ends.
+    ggml_tensor* forward(GGMLRunnerContext* ctx,
+                         ggml_tensor* x,
+                         std::vector<ggml_tensor*>& feat_map,
+                         int& feat_idx,
+                         int chunk_idx,
+                         bool causal = true) {
+        auto conv = std::dynamic_pointer_cast(blocks["conv"]);
+        const int pad = causal ? (time_kernel_size - 1) : (time_kernel_size - 1) / 2;
+
+        ggml_tensor* prev = (feat_idx < (int)feat_map.size()) ? feat_map[feat_idx] : nullptr;
+
+        // Save a contiguous copy of the last `pad` frames so the large `x`
+        // tensor is not kept alive across iterations by a dangling view.
+        if (feat_idx < (int)feat_map.size() && pad > 0 && x->ne[2] >= pad) {
+            auto slice = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - pad, x->ne[2]);
+            feat_map[feat_idx] = ggml_cont(ctx->ggml_ctx, slice);
+        }
+        feat_idx++;
+
+        if (pad > 0) {
+            ggml_tensor* left_pad;
+            if (prev != nullptr) {
+                left_pad = prev;
+            } else {
+                auto first_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
+                left_pad = first_frame;
+                for (int i = 1; i < pad; i++) {
+                    left_pad = ggml_concat(ctx->ggml_ctx, left_pad, first_frame, 2);
+                }
+            }
+            x = ggml_concat(ctx->ggml_ctx, left_pad, x, 2);
+        }
+
+        if (!causal && pad > 0) {
+            auto last_frame = ggml_ext_slice(ctx->ggml_ctx, x, 2, x->ne[2] - 1, x->ne[2]);
+            auto right_pad = last_frame;
+            for (int i = 1; i < pad; i++) {
+                right_pad = ggml_concat(ctx->ggml_ctx, right_pad, last_frame, 2);
+            }
+            x = ggml_concat(ctx->ggml_ctx, x, right_pad, 2);
+        }
+
+        return conv->forward(ctx, x);
+    }
 };

 struct PixelNorm3D : public UnaryBlock {
@@ -225,6 +275,51 @@
             return ggml_add(ctx->ggml_ctx, h, x);
         }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* timestep,
+                             bool causal,
+                             std::vector<ggml_tensor*>& feat_map,
+                             int& feat_idx,
+                             int chunk_idx) {
+            auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]);
+            auto conv1 = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv1"]);
+            auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]);
+            auto conv2 = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv2"]);
+
+            ggml_tensor* shift1 = nullptr;
+            ggml_tensor* scale1 = nullptr;
+            ggml_tensor* shift2 = nullptr;
+            ggml_tensor* scale2 = nullptr;
+            if (timestep_conditioning) {
+                GGML_ASSERT(timestep != nullptr);
+                auto values = ggml_add(ctx->ggml_ctx,
+                                       params["scale_shift_table"],
+                                       ggml_reshape_2d(ctx->ggml_ctx, timestep, channels, 4));
+                auto chunks = ggml_ext_chunk(ctx->ggml_ctx, values, 4, 1, false);
+                shift1 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[0]));
+                scale1 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[1]));
+                shift2 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[2]));
+                scale2 = reshape_channel_broadcast(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, chunks[3]));
+            }
+
+            auto h = norm1->forward(ctx, x);
+            if (timestep_conditioning) {
+                h = apply_scale_shift(ctx->ggml_ctx, h, scale1, shift1);
+            }
+            h = ggml_silu_inplace(ctx->ggml_ctx, h);
+            h = conv1->forward(ctx, h, feat_map, feat_idx, chunk_idx, causal);
+
+            h = norm2->forward(ctx, h);
+            if (timestep_conditioning) {
+                h = apply_scale_shift(ctx->ggml_ctx, h, scale2, shift2);
+            }
+            h = ggml_silu_inplace(ctx->ggml_ctx, h);
+            h = conv2->forward(ctx, h, feat_map, feat_idx, chunk_idx, causal);
+
+            return ggml_add(ctx->ggml_ctx, h, x);
+        }
 };

 struct UNetMidBlock3D : public GGMLBlock {
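
An invariant worth spelling out before the remaining hunks: feat_idx is a cursor into feat_map, and every chunked forward claims the next slot in call order, so slot k belongs to a fixed conv layer only because each frame traverses the layers in exactly the same order. A self-contained toy that mirrors the discipline (ints standing in for tensors, names invented for illustration):

    #include <cstdio>
    #include <vector>

    // Each "layer" reads what its slot held after the previous chunk, then
    // stores its own input there; feat_idx advances one slot per layer.
    static int layer(std::vector<int>& feat_map, int& feat_idx, int x) {
        int prev = feat_map[feat_idx];  // 0 plays the role of nullptr
        feat_map[feat_idx] = x;         // context carried to the next chunk
        feat_idx++;                     // hand the next slot to the next layer
        return x + prev;
    }

    int main() {
        std::vector<int> feat_map(128, 0);  // generous slot count, like the patch
        for (int chunk_idx = 0; chunk_idx < 3; chunk_idx++) {
            int feat_idx = 0;               // the cursor resets for every chunk
            int h = layer(feat_map, feat_idx, chunk_idx + 1);  // always slot 0
            h = layer(feat_map, feat_idx, h);                  // always slot 1
            std::printf("chunk %d -> %d (slots used: %d)\n", chunk_idx, h, feat_idx);
        }
        return 0;
    }
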
@@ -263,6 +358,26 @@
         }
         return x;
     }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx,
+                         ggml_tensor* x,
+                         ggml_tensor* timestep,
+                         bool causal,
+                         std::vector<ggml_tensor*>& feat_map,
+                         int& feat_idx,
+                         int chunk_idx) {
+        ggml_tensor* timestep_embed = nullptr;
+        if (timestep_conditioning) {
+            GGML_ASSERT(timestep != nullptr);
+            auto time_embedder = std::dynamic_pointer_cast(blocks["time_embedder"]);
+            timestep_embed = time_embedder->forward(ctx, timestep);
+        }
+        for (int i = 0; i < num_layers; i++) {
+            auto resnet = std::dynamic_pointer_cast(blocks["res_blocks." + std::to_string(i)]);
+            x = resnet->forward(ctx, x, timestep_embed, causal, feat_map, feat_idx, chunk_idx);
+        }
+        return x;
+    }
 };

 struct DepthToSpaceUpsample : public GGMLBlock {
@@ -314,6 +429,35 @@
         }
         return x;
     }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx,
+                         ggml_tensor* x,
+                         bool causal,
+                         std::vector<ggml_tensor*>& feat_map,
+                         int& feat_idx,
+                         int chunk_idx) {
+        auto conv = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv"]);
+
+        bool drop_first = (chunk_idx == 0) && (factor_t > 1);
+
+        ggml_tensor* x_in = nullptr;
+        if (residual) {
+            x_in = depth_to_space_3d(ctx->ggml_ctx, x, in_channels / (factor_t * factor_s * factor_s), factor_t, factor_s, drop_first);
+            int repeat = (factor_t * factor_s * factor_s) / out_channels_reduction_factor;
+            auto res = x_in;
+            for (int i = 1; i < repeat; i++) {
+                res = ggml_concat(ctx->ggml_ctx, res, x_in, 3);
+            }
+            x_in = res;
+        }
+
+        x = conv->forward(ctx, x, feat_map, feat_idx, chunk_idx, causal);
+        x = depth_to_space_3d(ctx->ggml_ctx, x, get_output_channels(), factor_t, factor_s, drop_first);
+        if (residual) {
+            x = ggml_add(ctx->ggml_ctx, x, x_in);
+        }
+        return x;
+    }
 };

 struct SpaceToDepthDownsample : public GGMLBlock {
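
On drop_first in the upsampler hunk above: for the first chunk the causal conv sees repeat-first-frame padding rather than real history, so after temporal depth-to-space the leading output frame is a duplicate and gets dropped; later chunks have genuine context in feat_map and keep every frame. A toy frame count under an assumed single stage with factor_t = 2 (illustrative numbers, not taken from the LTX configuration):

    #include <cstdio>

    int main() {
        const int factor_t = 2;  // assumed temporal upsample factor
        const int chunks   = 5;  // one latent frame per chunk, as in decode_tiled
        int total_out = 0;
        for (int chunk_idx = 0; chunk_idx < chunks; chunk_idx++) {
            int out_frames  = 1 * factor_t;  // temporal depth-to-space
            bool drop_first = (chunk_idx == 0) && (factor_t > 1);
            if (drop_first) {
                out_frames -= 1;             // padding duplicate removed
            }
            total_out += out_frames;
        }
        // Prints "5 latent frames -> 9 output frames", i.e. 1 + (T - 1) * factor_t,
        // the usual frame count for a causal video VAE.
        std::printf("%d latent frames -> %d output frames\n", chunks, total_out);
        return 0;
    }
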
@@ -735,6 +879,61 @@
         x = conv_out->forward(ctx, x, causal_decoder);
         return x;
     }
+
+    // Process a single latent frame through the complete decoder (conv_in → up_blocks
+    // → final layers), using feat_map to carry per-layer causal context from the
+    // previous frame. Designed for tiled temporal decode: each iteration receives
+    // 1 latent frame so that intermediate tensors can be freed between iterations.
+    ggml_tensor* forward_tiled_frame(GGMLRunnerContext* ctx,
+                                     ggml_tensor* x,
+                                     ggml_tensor* timestep,
+                                     std::vector<ggml_tensor*>& feat_map,
+                                     int& feat_idx,
+                                     int chunk_idx) {
+        auto conv_in = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_in"]);
+        auto conv_norm_out = std::dynamic_pointer_cast(blocks["conv_norm_out"]);
+        auto conv_out = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv_out"]);
+
+        ggml_tensor* scaled_timestep = timestep;
+        if (timestep_conditioning && timestep != nullptr) {
+            auto multiplier = ggml_ext_backend_tensor_get_f32(params["timestep_scale_multiplier"]);
+            scaled_timestep = ggml_ext_scale(ctx->ggml_ctx, timestep, multiplier);
+        }
+
+        // conv_in with feat_map for left temporal context
+        x = conv_in->forward(ctx, x, feat_map, feat_idx, chunk_idx, causal_decoder);
+
+        // up_blocks
+        int block_idx = 0;
+        while (blocks.find("up_blocks." + std::to_string(block_idx)) != blocks.end()) {
+            auto mid_block = std::dynamic_pointer_cast<UNetMidBlock3D>(blocks["up_blocks." + std::to_string(block_idx)]);
+            if (mid_block) {
+                x = mid_block->forward(ctx, x, scaled_timestep, causal_decoder,
+                                       feat_map, feat_idx, chunk_idx);
+            } else {
+                auto upsample = std::dynamic_pointer_cast<DepthToSpaceUpsample>(
+                    blocks["up_blocks." + std::to_string(block_idx)]);
+                x = upsample->forward(ctx, x, causal_decoder,
+                                      feat_map, feat_idx, chunk_idx);
+            }
+            block_idx++;
+        }
+
+        x = conv_norm_out->forward(ctx, x);
+        if (timestep_conditioning) {
+            auto last_time_embedder = std::dynamic_pointer_cast(blocks["last_time_embedder"]);
+            auto timestep_embed = last_time_embedder->forward(ctx, scaled_timestep);
+            auto [shift, scale] = get_shift_scale(ctx->ggml_ctx,
+                                                  params["last_scale_shift_table"],
+                                                  timestep_embed,
+                                                  hidden_channels,
+                                                  2);
+            x = apply_scale_shift(ctx->ggml_ctx, x, scale, shift);
+        }
+        x = ggml_silu_inplace(ctx->ggml_ctx, x);
+        x = conv_out->forward(ctx, x, feat_map, feat_idx, chunk_idx, causal_decoder);
+        return x;
+    }
 };

 struct VideoVAE : public GGMLBlock {
@@ -779,6 +978,39 @@
         return out;
     }
+
+    // Tiled temporal decode: each latent frame is processed through the COMPLETE
+    // decoder individually. Per-layer causal context is passed via feat_map
+    // (contiguous copies, not views) so that each iteration's large intermediate
+    // tensors can be freed by GGML before the next iteration starts.
+    ggml_tensor* decode_tiled(GGMLRunnerContext* ctx,
+                              ggml_tensor* z,
+                              ggml_tensor* timestep) {
+        auto decoder = std::dynamic_pointer_cast(blocks["decoder"]);
+        auto processor = std::dynamic_pointer_cast(blocks["per_channel_statistics"]);
+        auto latents = processor->un_normalize(ctx, z);
+
+        const int64_t T = z->ne[2];
+        if (T <= 1) {
+            auto out = decoder->forward(ctx, latents, timestep);
+            return WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
+        }
+
+        // feat_map holds ggml_tensor* nodes (contiguous copies at each conv layer).
+        // 128 slots is generous enough for any supported decoder configuration.
+        std::vector<ggml_tensor*> feat_map(128, nullptr);
+
+        ggml_tensor* out = nullptr;
+        for (int i = 0; i < (int)T; i++) {
+            int feat_idx = 0;
+            auto z_i = ggml_ext_slice(ctx->ggml_ctx, latents, 2, i, i + 1);
+            auto out_i = decoder->forward_tiled_frame(ctx, z_i, timestep,
+                                                      feat_map, feat_idx, i);
+            out = (out == nullptr) ? out_i : ggml_concat(ctx->ggml_ctx, out, out_i, 2);
+        }
+
+        return WAN::WanVAE::unpatchify(ctx->ggml_ctx, out, patch_size, 1);
+    }
+
     ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) {
         GGML_ASSERT(!decode_only);
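
The point of the per-frame loop: a non-tiled decode keeps the intermediates of all T latent frames alive inside one graph, while the tiled path holds roughly one frame's intermediates plus the small per-layer tail copies in feat_map. A back-of-envelope comparison with invented numbers (assumptions for illustration, not measurements from this patch):

    #include <cstdio>

    int main() {
        const long long T             = 8;            // latent frames
        const long long per_frame_act = 512LL << 20;  // assumed ~512 MiB of intermediates per frame
        const long long feat_tails    = 64LL << 20;   // assumed ~64 MiB of cached conv tails

        long long full  = T * per_frame_act;           // all frames alive at once
        long long tiled = per_frame_act + feat_tails;  // one frame + carried context

        std::printf("full decode : %lld MiB\n", full >> 20);   // 4096 MiB
        std::printf("tiled decode: %lld MiB\n", tiled >> 20);  //  576 MiB
        return 0;
    }
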
@@ -797,6 +1029,7 @@ struct LTXVideoVAE : public VAE {
     bool decode_only;
+    bool temporal_tiling_enabled = false;
     int ltx_vae_version;
     bool timestep_conditioning;
     int patch_size;
@@ -829,30 +1062,31 @@
         return "ltx_video_vae";
     }

+    void set_temporal_tiling_enabled(bool enabled) override {
+        temporal_tiling_enabled = enabled;
+    }
+
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) override {
         vae.get_param_tensors(tensors, prefix);
     }

     ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) {
-        LOG_DEBUG("ltx_video_vae build_graph input %dx%dx%dx%d",
-                  (int)z_tensor.shape()[0],
-                  (int)z_tensor.shape()[1],
-                  (int)z_tensor.shape()[2],
-                  (int)z_tensor.shape()[3]);
-        ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+        ggml_cgraph* gf = new_graph_custom(20480);

         ggml_tensor* z = make_input(z_tensor);
         ggml_tensor* timestep = nullptr;
         if (timestep_conditioning) {
             timestep = make_input(decode_timestep_tensor);
         }

-        auto runner_ctx = get_context();
-        ggml_tensor* out = decode_graph ? vae.decode(&runner_ctx, z, timestep) : vae.encode(&runner_ctx, z);
-        LOG_DEBUG("ltx_video_vae build_graph output ne=[%lld,%lld,%lld,%lld]",
-                  (long long)out->ne[0],
-                  (long long)out->ne[1],
-                  (long long)out->ne[2],
-                  (long long)out->ne[3]);
+        auto runner_ctx = get_context();
+        ggml_tensor* out;
+        bool use_tiled = decode_graph && temporal_tiling_enabled &&
+                         z_tensor.dim() == 5 && z_tensor.shape()[2] > 1;
+        if (use_tiled) {
+            out = vae.decode_tiled(&runner_ctx, z, timestep);
+        } else {
+            out = decode_graph ? vae.decode(&runner_ctx, z, timestep) : vae.encode(&runner_ctx, z);
+        }

         ggml_build_forward_expand(gf, out);
         return gf;
@@ -889,12 +1123,6 @@
         if (result.empty()) {
             return {};
         }
-        LOG_DEBUG("ltx_video_vae host output shape=[%lld,%lld,%lld,%lld] dim=%lld",
-                  (long long)(result.shape().size() > 0 ? result.shape()[0] : 0),
-                  (long long)(result.shape().size() > 1 ? result.shape()[1] : 0),
-                  (long long)(result.shape().size() > 2 ? result.shape()[2] : 0),
-                  (long long)(result.shape().size() > 3 ? result.shape()[3] : 0),
-                  (long long)result.dim());
         return result;
     }
diff --git a/src/ltxv.hpp b/src/ltxv.hpp
index c9eea924..7f5532ec 100644
--- a/src/ltxv.hpp
+++ b/src/ltxv.hpp
@@ -924,12 +924,12 @@
         if (count < 0) {
             count = coeff - start;
         }
-        auto t = ggml_reshape_3d(ctx->ggml_ctx, timestep, dim, coeff, timestep->ne[1]);
-        auto s = ggml_reshape_3d(ctx->ggml_ctx, table, dim, coeff, 1);
-        auto e = ggml_new_tensor_3d(ctx->ggml_ctx, timestep->type, dim, coeff, timestep->ne[1]);
-        t = ggml_repeat(ctx->ggml_ctx, t, e);
-        s = ggml_repeat(ctx->ggml_ctx, s, e);
-        auto out = ggml_add(ctx->ggml_ctx, s, t);
+        auto t      = ggml_reshape_3d(ctx->ggml_ctx, timestep, dim, coeff, timestep->ne[1]);
+        auto s      = ggml_reshape_3d(ctx->ggml_ctx, table, dim, coeff, 1);
+        auto e      = ggml_new_tensor_3d(ctx->ggml_ctx, timestep->type, dim, coeff, timestep->ne[1]);
+        t           = ggml_repeat(ctx->ggml_ctx, t, e);
+        s           = ggml_repeat(ctx->ggml_ctx, s, e);
+        auto out    = ggml_add(ctx->ggml_ctx, s, t);
         auto chunks = ggml_ext_chunk(ctx->ggml_ctx, out, static_cast<int>(coeff), 1);
         return std::vector<ggml_tensor*>(chunks.begin() + start, chunks.begin() + start + count);
     }
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index bab6fee9..20576298 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -144,7 +144,7 @@ public:
     bool apply_lora_immediately = false;
     std::string taesd_path;

-    sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0};

     bool offload_params_to_cpu = false;
     bool use_pmid = false;
@@ -1520,9 +1520,11 @@ public:
         sd::Tensor decoded;
         bool is_video = preview_latent_tensor_is_video(latents);
         if (preview_vae) {
+            preview_vae->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
             vae_latents = preview_vae->diffusion_to_vae_latents(latents);
             decoded = preview_vae->decode(n_threads, vae_latents, vae_tiling_params, is_video, circular_x, circular_y, true);
         } else {
+            first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
             vae_latents = first_stage_model->diffusion_to_vae_latents(latents);
             decoded = first_stage_model->decode(n_threads, vae_latents, vae_tiling_params, is_video, circular_x, circular_y, true);
         }
@@ -1963,6 +1965,7 @@ public:

     sd::Tensor decode_first_stage(const sd::Tensor& x, bool decode_video = false) {
         auto latents = first_stage_model->diffusion_to_vae_latents(x);
+        first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
         return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
     }
@@ -2398,7 +2401,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
     sd_img_gen_params->batch_count = 1;
     sd_img_gen_params->control_strength = 0.9f;
     sd_img_gen_params->pm_params = {nullptr, 0, nullptr, 20.f};
-    sd_img_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
+    sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};
     sd_cache_params_init(&sd_img_gen_params->cache);
     sd_hires_params_init(&sd_img_gen_params->hires);
 }
@@ -2427,7 +2430,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
                              "increase_ref_index: %s\n"
                              "control_strength: %.2f\n"
                              "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
-                             "VAE tiling: %s\n"
+                             "VAE tiling: %s (temporal=%s)\n"
                              "hires: {enabled=%s, upscaler=%s, model_path=%s, scale=%.2f, target=%dx%d, steps=%d, denoising_strength=%.2f}\n",
                              SAFE_STR(sd_img_gen_params->prompt),
                              SAFE_STR(sd_img_gen_params->negative_prompt),
@@ -2446,6 +2449,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
                              sd_img_gen_params->pm_params.id_images_count,
                              SAFE_STR(sd_img_gen_params->pm_params.id_embed_path),
                              BOOL_STR(sd_img_gen_params->vae_tiling_params.enabled),
+                             BOOL_STR(sd_img_gen_params->vae_tiling_params.temporal_tiling),
                              BOOL_STR(sd_img_gen_params->hires.enabled),
                              sd_hires_upscaler_name(sd_img_gen_params->hires.upscaler),
                              SAFE_STR(sd_img_gen_params->hires.model_path),
@@ -2483,7 +2487,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
     sd_vid_gen_params->fps = 16;
     sd_vid_gen_params->moe_boundary = 0.875f;
     sd_vid_gen_params->vace_strength = 1.f;
-    sd_vid_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
+    sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};
     sd_cache_params_init(&sd_vid_gen_params->cache);
 }
diff --git a/src/vae.hpp b/src/vae.hpp
index bf4966e5..f34ee5ce 100644
--- a/src/vae.hpp
+++ b/src/vae.hpp
@@ -214,6 +214,7 @@ public:
     virtual sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) = 0;
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
     virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
+    virtual void set_temporal_tiling_enabled(bool enabled) { SD_UNUSED(enabled); };
 };

 struct FakeVAE : public VAE {
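
End to end, a minimal caller sketch of switching the new path on through the public API; everything except the vae_tiling_params wiring is elided, and enabling spatial tiling alongside temporal tiling is just one plausible combination:

    #include "stable-diffusion.h"

    // Sketch: request temporally tiled LTX VAE decode for a video generation call.
    void configure_tiling(sd_vid_gen_params_t* params) {
        sd_vid_gen_params_init(params);  // defaults: {false, false, 0, 0, 0.5f, 0.0f, 0.0f}
        params->vae_tiling_params.enabled = true;          // spatial tiles
        params->vae_tiling_params.temporal_tiling = true;  // one latent frame per decoder pass
    }

On the CLI side, the --temporal-tiling flag added in examples/cli/README.md sets the same field.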