feat: add LTX spatial latent upscale hires support (#1533)

2026-06-09 15:56:39 +00:00 · 2026-05-20 22:27:09 +08:00 · 2026-05-20 22:27:09 +08:00 · b3374e6a71
commit b3374e6a71
parent bdd937f29a
12 changed files with 1073 additions and 137 deletions
--- a/assets/ltx2/hires_i2v.webm
+++ b/assets/ltx2/hires_i2v.webm
--- a/docs/ltx2.md
+++ b/docs/ltx2.md
@ -13,6 +13,8 @@
    - safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
 - Download audio vae
    - safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
 - Download LTX spatial latent upscaler
    - safetensors: https://huggingface.co/Lightricks/LTX-2.3/resolve/main/ltx-2.3-spatial-upscaler-x2-1.1.safetensors
 ## Examples
@ -50,4 +52,26 @@
  src="../assets/ltx2/flf2v.webm"
  controls
  muted
  style="max-width: 100%; height: auto;"></video>
 ### LTX-2.3 spatial latent upscale
 LTX spatial latent upscale runs a model-backed x2 latent upsampler between the low-resolution video pass and the high-resolution refine pass. `-W` and `-H` set the pre-upscale generation size; the spatial upsampler then doubles the latent width and height (the number of frames is unchanged).
 Put `ltx-2.3-spatial-upscaler-x2-1.1.safetensors` under the directory passed to `--hires-upscalers-dir`, then use the model name without path or extension in `--hires-upscaler`.
 ```
 .\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors --hires-upscalers-dir ..\..\ComfyUI\models\latent_upscale_models --hires-upscaler ltx-2.3-spatial-upscaler-x2-1.1 --hires --hires-steps 4 -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v  -W 640 -H 360 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\ernie_image\turbo_example.png -o hires_i2v.webm
 ```
 By default, the hires refine pass uses the main sampler and scheduler, then trims the second-pass sigma schedule by `--hires-denoising-strength` (`0.7` by default). To reproduce a ComfyUI-style explicit refine schedule, pass custom hires sigmas:
 ```
 --hires-sigmas "0.85,0.725,0.421875,0.0"
 ```
 <video
  src="../assets/ltx2/hires_i2v.webm"
  controls
  muted
  style="max-width: 100%; height: auto;"></video>
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -176,6 +176,8 @@ Generation Options:
                                           model-specific
  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g.,
                                           "14.61,7.8,3.5,0.0").
  --hires-sigmas                           custom sigma values for the highres fix second pass, comma-separated (e.g.,
                                           "0.85,0.725,0.421875,0.0").
  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@ -1134,11 +1134,11 @@ ArgOptions SDGenerationParams::get_options() {
        return 1;
    };
-    auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
+    auto parse_sigmas_arg = [&](const char* value, std::vector<float>* target, const char* option_name) {
-        if (++index >= argc) {
+        if (target == nullptr || value == nullptr) {
            return -1;
        }
-        std::string sigmas_str = argv[index];
+        std::string sigmas_str = value;
        if (!sigmas_str.empty() && sigmas_str.front() == '[') {
            sigmas_str.erase(0, 1);
        }
@ -1146,6 +1146,7 @@ ArgOptions SDGenerationParams::get_options() {
            sigmas_str.pop_back();
        }
        size_t before = target->size();
        std::stringstream ss(sigmas_str);
        std::string item;
        while (std::getline(ss, item, ',')) {
@ -1153,24 +1154,38 @@ ArgOptions SDGenerationParams::get_options() {
            item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1);
            if (!item.empty()) {
                try {
-                    custom_sigmas.push_back(std::stof(item));
+                    target->push_back(std::stof(item));
                } catch (const std::invalid_argument&) {
-                    LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str());
+                    LOG_ERROR("error: invalid float value '%s' in %s", item.c_str(), option_name);
                    return -1;
                } catch (const std::out_of_range&) {
-                    LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str());
+                    LOG_ERROR("error: float value '%s' out of range in %s", item.c_str(), option_name);
                    return -1;
                }
            }
        }
-        if (custom_sigmas.empty() && !sigmas_str.empty()) {
+        if (target->size() == before && !sigmas_str.empty()) {
-            LOG_ERROR("error: could not parse any sigma values from '%s'", argv[index]);
+            LOG_ERROR("error: could not parse any sigma values from '%s'", value);
            return -1;
        }
        return 1;
    };
    // Handler for --sigmas: the value is the argument right after the option.
    // Returns -1 when that argument is missing, otherwise whatever
    // parse_sigmas_arg reports after appending to custom_sigmas.
    auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
        const int value_index = index + 1;
        return value_index < argc
                   ? parse_sigmas_arg(argv[value_index], &custom_sigmas, "--sigmas")
                   : -1;
    };
    // Handler for --hires-sigmas: the value is the argument right after the
    // option. Returns -1 when that argument is missing, otherwise whatever
    // parse_sigmas_arg reports after appending to hires_custom_sigmas.
    auto on_hires_sigmas_arg = [&](int argc, const char** argv, int index) {
        const int value_index = index + 1;
        return value_index < argc
                   ? parse_sigmas_arg(argv[value_index], &hires_custom_sigmas, "--hires-sigmas")
                   : -1;
    };
    auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
        if (++index >= argc) {
            return -1;
@ -1293,6 +1308,10 @@ ArgOptions SDGenerationParams::get_options() {
         "--sigmas",
         "custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").",
         on_sigmas_arg},
        {"",
         "--hires-sigmas",
         "custom sigma values for the highres fix second pass, comma-separated (e.g., \"0.85,0.725,0.421875,0.0\").",
         on_hires_sigmas_arg},
        {"",
         "--skip-layers",
         "layers to skip for SLG steps (default: [7,8,9])",
@ -1525,11 +1544,31 @@ static bool resolve_model_file_from_dir(const std::string& model_name,
        LOG_ERROR("%s directory is empty", label);
        return false;
    }
    // Reports whether model_name ends with one of the entries of valid_ext,
    // comparing characters after std::tolower so that e.g. ".GGUF" matches
    // ".gguf". Extensions longer than model_name can never match.
    auto ends_with_valid_ext = [&]() {
        const auto same_ignoring_case = [](unsigned char lhs, unsigned char rhs) {
            return std::tolower(lhs) == std::tolower(rhs);
        };
        for (const auto& ext : valid_ext) {
            if (ext.size() > model_name.size()) {
                continue;
            }
            // Compare the tail of model_name against ext in place.
            const auto tail_offset = model_name.size() - ext.size();
            const auto tail_begin  = model_name.begin() + tail_offset;
            if (std::equal(tail_begin, model_name.end(), ext.begin(), same_ignoring_case)) {
                return true;
            }
        }
        return false;
    };
    if (model_name.empty() ||
        model_name.find('/') != std::string::npos ||
        model_name.find('\\') != std::string::npos ||
        fs::path(model_name).has_root_path() ||
-        fs::path(model_name).has_extension()) {
+        ends_with_valid_ext()) {
        LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str());
        return false;
    }
@ -1633,6 +1672,9 @@ bool SDGenerationParams::from_json_str(
        if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) {
            hires_denoising_strength = hires_json["denoising_strength"];
        }
        if (hires_json.contains("custom_sigmas") && hires_json["custom_sigmas"].is_array()) {
            hires_custom_sigmas = hires_json["custom_sigmas"].get<std::vector<float>>();
        }
        if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) {
            hires_upscale_tile_size = hires_json["upscale_tile_size"];
        }
@ -2080,6 +2122,10 @@ bool SDGenerationParams::validate(SDMode mode) {
            LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]");
            return false;
        }
        if (!hires_custom_sigmas.empty() && hires_custom_sigmas.size() < 2) {
            LOG_ERROR("error: hires custom sigmas must contain at least two values");
            return false;
        }
        if (hires_upscale_tile_size < 1) {
            LOG_ERROR("error: hires upscale tile size must be positive");
            return false;
@ -2174,15 +2220,17 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
    params.vae_tiling_params     = vae_tiling_params;
    params.cache                 = cache_params;
-    params.hires.enabled            = hires_enabled;
+    params.hires.enabled             = hires_enabled;
-    params.hires.upscaler           = resolved_hires_upscaler;
+    params.hires.upscaler            = resolved_hires_upscaler;
-    params.hires.model_path         = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
+    params.hires.model_path          = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
-    params.hires.scale              = hires_scale;
+    params.hires.scale               = hires_scale;
-    params.hires.target_width       = hires_width;
+    params.hires.target_width        = hires_width;
-    params.hires.target_height      = hires_height;
+    params.hires.target_height       = hires_height;
-    params.hires.steps              = hires_steps;
+    params.hires.steps               = hires_steps;
-    params.hires.denoising_strength = hires_denoising_strength;
+    params.hires.denoising_strength  = hires_denoising_strength;
-    params.hires.upscale_tile_size  = hires_upscale_tile_size;
+    params.hires.upscale_tile_size   = hires_upscale_tile_size;
    params.hires.custom_sigmas       = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
    params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
    return params;
 }
@ -2215,27 +2263,38 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
    high_noise_sample_params.extra_sample_args        = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
    cache_params.scm_mask                             = scm_mask.empty() ? nullptr : scm_mask.c_str();
-    params.loras                    = lora_vec.empty() ? nullptr : lora_vec.data();
+    params.loras                     = lora_vec.empty() ? nullptr : lora_vec.data();
-    params.lora_count               = static_cast<uint32_t>(lora_vec.size());
+    params.lora_count                = static_cast<uint32_t>(lora_vec.size());
-    params.prompt                   = prompt.c_str();
+    params.prompt                    = prompt.c_str();
-    params.negative_prompt          = negative_prompt.c_str();
+    params.negative_prompt           = negative_prompt.c_str();
-    params.clip_skip                = clip_skip;
+    params.clip_skip                 = clip_skip;
-    params.init_image               = init_image.get();
+    params.init_image                = init_image.get();
-    params.end_image                = end_image.get();
+    params.end_image                 = end_image.get();
-    params.control_frames           = control_frame_views.empty() ? nullptr : control_frame_views.data();
+    params.control_frames            = control_frame_views.empty() ? nullptr : control_frame_views.data();
-    params.control_frames_size      = static_cast<int>(control_frame_views.size());
+    params.control_frames_size       = static_cast<int>(control_frame_views.size());
-    params.width                    = get_resolved_width();
+    params.width                     = get_resolved_width();
-    params.height                   = get_resolved_height();
+    params.height                    = get_resolved_height();
-    params.sample_params            = sample_params;
+    params.sample_params             = sample_params;
-    params.high_noise_sample_params = high_noise_sample_params;
+    params.high_noise_sample_params  = high_noise_sample_params;
-    params.moe_boundary             = moe_boundary;
+    params.moe_boundary              = moe_boundary;
-    params.strength                 = strength;
+    params.strength                  = strength;
-    params.seed                     = seed;
+    params.seed                      = seed;
-    params.video_frames             = video_frames;
+    params.video_frames              = video_frames;
-    params.fps                      = fps;
+    params.fps                       = fps;
-    params.vace_strength            = vace_strength;
+    params.vace_strength             = vace_strength;
-    params.vae_tiling_params        = vae_tiling_params;
+    params.vae_tiling_params         = vae_tiling_params;
-    params.cache                    = cache_params;
+    params.cache                     = cache_params;
    params.hires.enabled             = hires_enabled;
    params.hires.upscaler            = resolved_hires_upscaler;
    params.hires.model_path          = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
    params.hires.scale               = hires_scale;
    params.hires.target_width        = hires_width;
    params.hires.target_height       = hires_height;
    params.hires.steps               = hires_steps;
    params.hires.denoising_strength  = hires_denoising_strength;
    params.hires.upscale_tile_size   = hires_upscale_tile_size;
    params.hires.custom_sigmas       = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
    params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
    return params;
 }
@ -2318,6 +2377,7 @@ std::string SDGenerationParams::to_string() const {
        << ", target_height: " << hires_height
        << ", steps: " << hires_steps
        << ", denoising_strength: " << hires_denoising_strength
        << ", custom_sigmas: " << vec_to_string(hires_custom_sigmas)
        << ", upscale_tile_size: " << hires_upscale_tile_size << " },\n"
        << "  vae_tiling_params: { "
        << vae_tiling_params.enabled << ", "
@ -2469,6 +2529,7 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
            {"target_height", gen_params.hires_height},
            {"steps", gen_params.hires_steps},
            {"denoising_strength", gen_params.hires_denoising_strength},
            {"custom_sigmas", gen_params.hires_custom_sigmas},
            {"upscale_tile_size", gen_params.hires_upscale_tile_size},
        };
    }
@ -2588,6 +2649,9 @@ std::string get_image_params(const SDContextParams& ctx_params,
        parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", ";
        parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", ";
        parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", ";
        if (!gen_params.hires_custom_sigmas.empty()) {
            parameter_string += "Hires custom sigmas: " + vec_to_string(gen_params.hires_custom_sigmas) + ", ";
        }
    }
    parameter_string += "Version: stable-diffusion.cpp";
    parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode);
--- a/examples/common/common.h
+++ b/examples/common/common.h
@ -207,6 +207,7 @@ struct SDGenerationParams {
    int hires_steps                = 0;
    float hires_denoising_strength = 0.7f;
    int hires_upscale_tile_size    = 128;
    std::vector<float> hires_custom_sigmas;
    std::map<std::string, float> lora_map;
    std::map<std::string, float> high_noise_lora_map;
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -277,6 +277,8 @@ Default Generation Options:
                                           model-specific
  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g.,
                                           "14.61,7.8,3.5,0.0").
  --hires-sigmas                           custom sigma values for the highres fix second pass, comma-separated (e.g.,
                                           "0.85,0.725,0.421875,0.0").
  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
--- a/examples/server/api.md
+++ b/examples/server/api.md
@ -532,6 +532,7 @@ Shared default fields used by both `img_gen` and `vid_gen`:
 | `hires.target_height` | `integer` |
 | `hires.steps` | `integer` |
 | `hires.denoising_strength` | `number` |
 | `hires.custom_sigmas` | `array<number>` |
 | `hires.upscale_tile_size` | `integer` |
 `vid_gen`-specific default fields:
@ -685,6 +686,7 @@ Example:
    "target_height": 0,
    "steps": 0,
    "denoising_strength": 0.7,
    "custom_sigmas": [],
    "upscale_tile_size": 128
  },
@ -799,6 +801,7 @@ Other native fields:
 | `hires.target_height` | `integer` |
 | `hires.steps` | `integer` |
 | `hires.denoising_strength` | `number` |
 | `hires.custom_sigmas` | `array<number>` |
 | `hires.upscale_tile_size` | `integer` |
 | `vae_tiling_params` | `object` |
 | `cache_mode` | `string` |
@ -806,7 +809,7 @@ Other native fields:
 | `scm_mask` | `string` |
 | `scm_policy_dynamic` | `boolean` |
-For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory.
+For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory. `hires.custom_sigmas`, when present, overrides the generated second-pass hires sigma schedule; otherwise the hires schedule is trimmed by `hires.denoising_strength`.
 HTTP-only output fields:
--- a/examples/server/routes_sdcpp.cpp
+++ b/examples/server/routes_sdcpp.cpp
@ -100,6 +100,20 @@ static json make_sample_params_json(const sd_sample_params_t& sample_params, con
    };
 }
 // Builds the "hires" JSON object from the hires_* fields of `defaults`.
 // Shared by the img_gen and vid_gen defaults so both expose the same keys.
 static json make_hires_json(const SDGenerationParams& defaults) {
    json hires                  = json::object();
    hires["enabled"]            = defaults.hires_enabled;
    hires["upscaler"]           = defaults.hires_upscaler;
    hires["scale"]              = defaults.hires_scale;
    hires["target_width"]       = defaults.hires_width;
    hires["target_height"]      = defaults.hires_height;
    hires["steps"]              = defaults.hires_steps;
    hires["denoising_strength"] = defaults.hires_denoising_strength;
    hires["custom_sigmas"]      = defaults.hires_custom_sigmas;
    hires["upscale_tile_size"]  = defaults.hires_upscale_tile_size;
    return hires;
 }
 static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
    return {
        {"prompt", defaults.prompt},
@ -114,17 +128,7 @@ static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const
        {"increase_ref_index", defaults.increase_ref_index},
        {"control_strength", defaults.control_strength},
        {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
-        {"hires",
+        {"hires", make_hires_json(defaults)},
         {
             {"enabled", defaults.hires_enabled},
             {"upscaler", defaults.hires_upscaler},
             {"scale", defaults.hires_scale},
             {"target_width", defaults.hires_width},
             {"target_height", defaults.hires_height},
             {"steps", defaults.hires_steps},
             {"denoising_strength", defaults.hires_denoising_strength},
             {"upscale_tile_size", defaults.hires_upscale_tile_size},
         }},
        {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
        {"cache_mode", defaults.cache_mode},
        {"cache_option", defaults.cache_option},
@ -150,6 +154,7 @@ static json make_vid_gen_defaults_json(const SDGenerationParams& defaults, const
        {"vace_strength", defaults.vace_strength},
        {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
        {"high_noise_sample_params", make_sample_params_json(defaults.high_noise_sample_params, defaults.high_noise_skip_layers)},
        {"hires", make_hires_json(defaults)},
        {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
        {"cache_mode", defaults.cache_mode},
        {"cache_option", defaults.cache_option},
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -332,6 +332,8 @@ typedef struct {
    int steps;
    float denoising_strength;
    int upscale_tile_size;
    float* custom_sigmas;
    int custom_sigmas_count;
 } sd_hires_params_t;
 typedef struct {
@ -382,6 +384,7 @@ typedef struct {
    float vace_strength;
    sd_tiling_params_t vae_tiling_params;
    sd_cache_params_t cache;
    sd_hires_params_t hires;
 } sd_vid_gen_params_t;
 typedef struct sd_ctx_t sd_ctx_t;
--- a/src/ltx_latent_upscaler.hpp
+++ b/src/ltx_latent_upscaler.hpp
@ -0,0 +1,348 @@
 #ifndef __SD_LTX_LATENT_UPSCALER_HPP__
 #define __SD_LTX_LATENT_UPSCALER_HPP__
 #include <cinttypes>
 #include <cmath>
 #include <cstdlib>
 #include <map>
 #include <memory>
 #include <string>
 #include <utility>
 #include "common_dit.hpp"
 #include "ggml_extend.hpp"
 #include "ggml_graph_cut.h"
 #include "model.h"
 #include "util.h"
 namespace LTXVUpsampler {
    constexpr int LTX_UPSAMPLER_GRAPH_SIZE = 10240;
    // Hyper-parameters of the LTX latent upsampler. The initializers are the
    // defaults; detect_config_from_weights() overwrites the channel counts, the
    // block count and the two upsample flags from the tensors it finds.
    struct LatentUpsamplerConfig {
        int64_t in_channels{128};        // channels entering initial_conv / leaving final_conv
        int64_t mid_channels{1024};      // channel width used inside the network
        int num_blocks_per_stage{4};     // ResBlocks before, and again after, the upsample step
        int dims{3};                     // LatentUpsampler asserts this is 3
        bool spatial_upsample{true};     // LatentUpsampler asserts this is true
        bool temporal_upsample{false};   // LatentUpsampler asserts this is false
        bool rational_resampler{false};  // LatentUpsampler asserts this is false
    };
    // Returns true when `tensor_storage_map` has an entry keyed by `name`.
    static inline bool has_tensor(const String2TensorStorage& tensor_storage_map,
                                  const std::string& name) {
        const auto entry = tensor_storage_map.find(name);
        const bool found = entry != tensor_storage_map.end();
        return found;
    }
    // Returns ne[0] of the tensor stored under `name`, or `fallback` when the
    // map has no such entry.
    static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map,
                                         const std::string& name,
                                         int64_t fallback) {
        const auto entry = tensor_storage_map.find(name);
        const bool found = entry != tensor_storage_map.end();
        return found ? entry->second.ne[0] : fallback;
    }
    static inline int count_module_blocks(const String2TensorStorage& tensor_storage_map,
                                          const std::string& module_name) {
        int max_block            = -1;
        const std::string prefix = module_name + ".";
        for (const auto& pair : tensor_storage_map) {
            const std::string& name = pair.first;
            if (name.find(prefix) != 0) {
                continue;
            }
            size_t begin = prefix.size();
            size_t end   = name.find('.', begin);
            if (end == std::string::npos) {
                continue;
            }
            int index = atoi(name.substr(begin, end - begin).c_str());
            max_block = std::max(max_block, index);
        }
        return max_block + 1;
    }
    // Derives a LatentUpsamplerConfig from the tensors present in a checkpoint.
    // Fields that cannot be detected keep their LatentUpsamplerConfig defaults;
    // the two upsample flags are set purely by the presence of the matching
    // "*.0.weight" tensor.
    static inline LatentUpsamplerConfig detect_config_from_weights(const String2TensorStorage& tensor_storage_map) {
        LatentUpsamplerConfig detected;

        // Channel widths come from ne[0] of 1-D parameter tensors.
        detected.mid_channels = get_tensor_ne0(tensor_storage_map, "initial_norm.weight", detected.mid_channels);
        detected.in_channels  = get_tensor_ne0(tensor_storage_map, "final_conv.bias", detected.in_channels);

        // Keep the default block count when no "res_blocks.<i>." tensor exists.
        const int block_count = count_module_blocks(tensor_storage_map, "res_blocks");
        if (block_count > 0) {
            detected.num_blocks_per_stage = block_count;
        }

        detected.spatial_upsample  = has_tensor(tensor_storage_map, "upsampler.0.weight");
        detected.temporal_upsample = has_tensor(tensor_storage_map, "temporal_upsampler.0.weight");
        return detected;
    }
    // Group normalization with a learned per-channel scale ("weight") and shift
    // ("bias") for tensors whose channel axis is ne[3].
    class VideoGroupNorm : public GGMLBlock {
    protected:
        int num_groups;
        int64_t num_channels;
        float eps;
        std::string prefix;

        // Creates the two F32 parameter vectors (one value per channel) and
        // stores `prefix` so forward() can name them for the weight adapter.
        void init_params(ggml_context* ctx,
                         const String2TensorStorage& tensor_storage_map = {},
                         const std::string prefix                       = "") override {
            SD_UNUSED(tensor_storage_map);
            this->prefix     = prefix;
            params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels);
            params["bias"]   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels);
        }

    public:
        VideoGroupNorm(int num_groups, int64_t num_channels, float eps = 1e-05f)
            : num_groups(num_groups),
              num_channels(num_channels),
              eps(eps) {}

        // Normalizes `x` and returns a tensor with the same four extents.
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
            // LTX video latent layout is [W, H, T, C], but ggml_group_norm takes
            // ne[2] as the channel axis. H and T are therefore merged into one
            // axis for the normalization and split again on return.
            GGML_ASSERT(x->ne[3] == num_channels);
            auto gctx = ctx->ggml_ctx;

            const int64_t width  = x->ne[0];
            const int64_t height = x->ne[1];
            const int64_t frames = x->ne[2];

            ggml_tensor* folded = ggml_ext_cont(gctx, x);
            folded              = ggml_reshape_4d(gctx, folded, width, height * frames, num_channels, 1);
            ggml_tensor* normed = ggml_group_norm(gctx, folded, num_groups, eps);

            ggml_tensor* scale = params["weight"];
            ggml_tensor* shift = params["bias"];
            if (ctx->weight_adapter) {
                scale = ctx->weight_adapter->patch_weight(gctx, ctx->backend, scale, prefix + "weight");
                shift = ctx->weight_adapter->patch_weight(gctx, ctx->backend, shift, prefix + "bias");
            }

            // Put the per-channel vectors on ne[2] so they line up with the
            // folded tensor's channel axis.
            scale = ggml_reshape_4d(gctx, scale, 1, 1, num_channels, 1);
            shift = ggml_reshape_4d(gctx, shift, 1, 1, num_channels, 1);

            normed = ggml_mul_inplace(gctx, normed, scale);
            normed = ggml_add_inplace(gctx, normed, shift);
            return ggml_reshape_4d(gctx, normed, width, height, frames, num_channels);
        }
    };
    // Residual block: conv1 -> norm1 -> SiLU -> conv2 -> norm2, then the input is
    // added back and a final SiLU is applied. Both convolutions are 3x3x3 with
    // stride 1 and padding 1 and keep the channel count unchanged.
    class ResBlock : public GGMLBlock {
    public:
        // Only dims == 3 is supported (asserted).
        ResBlock(int64_t channels, int dims = 3) {
            GGML_ASSERT(dims == 3);
            const auto make_conv = [channels]() {
                return std::shared_ptr<GGMLBlock>(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
            };
            const auto make_norm = [channels]() {
                return std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, channels));
            };
            blocks["conv1"] = make_conv();
            blocks["norm1"] = make_norm();
            blocks["conv2"] = make_conv();
            blocks["norm2"] = make_norm();
        }

        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
            auto first_conv  = std::dynamic_pointer_cast<Conv3d>(blocks["conv1"]);
            auto first_norm  = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm1"]);
            auto second_conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv2"]);
            auto second_norm = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm2"]);

            // `x` itself is kept as the skip connection; `h` carries the main path.
            ggml_tensor* h = first_conv->forward(ctx, x);
            h              = first_norm->forward(ctx, h);
            h              = ggml_silu_inplace(ctx->ggml_ctx, h);
            h              = second_conv->forward(ctx, h);
            h              = second_norm->forward(ctx, h);

            ggml_tensor* summed = ggml_add(ctx->ggml_ctx, h, x);
            return ggml_silu(ctx->ggml_ctx, summed);
        }
    };
    // Spatial pixel shuffle: trades a factor of upscale_factor^2 in channels for
    // a factor of upscale_factor in both height and width. Only a factor of 2 is
    // supported (asserted in forward()).
    class PixelShuffleND : public UnaryBlock {
    protected:
        int upscale_factor;

    public:
        explicit PixelShuffleND(int upscale_factor)
            : upscale_factor(upscale_factor) {}

        // x: [b*f, c*4, h, w] -> [b*f, c, h*2, w*2]
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
            GGML_ASSERT(upscale_factor == 2);
            auto gctx = ctx->ggml_ctx;

            // Capture the spatial extents before the axes are rearranged.
            const int64_t width  = x->ne[0];
            const int64_t height = x->ne[1];

            // [b*f, c*4, h, w] -> [b*f, h, w, c*4]
            ggml_tensor* permuted = ggml_ext_cont(gctx, ggml_ext_torch_permute(gctx, x, 2, 0, 1, 3));

            // [b*f, h, w, c*4] -> [b*f, h*w, c*4]
            ggml_tensor* tokens = ggml_reshape_3d(gctx,
                                                  permuted,
                                                  permuted->ne[0],
                                                  permuted->ne[1] * permuted->ne[2],
                                                  permuted->ne[3]);

            return DiT::unpatchify(gctx, tokens, height, width, upscale_factor, upscale_factor, true);
        }
    };
    class LatentUpsampler : public GGMLBlock {
    public:
        LatentUpsamplerConfig config;
        explicit LatentUpsampler(LatentUpsamplerConfig config)
            : config(std::move(config)) {
            GGML_ASSERT(this->config.dims == 3);
            GGML_ASSERT(this->config.spatial_upsample);
            GGML_ASSERT(!this->config.temporal_upsample);
            GGML_ASSERT(!this->config.rational_resampler);
            blocks["initial_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(this->config.in_channels,
                                                                           this->config.mid_channels,
                                                                           {3, 3, 3},
                                                                           {1, 1, 1},
                                                                           {1, 1, 1}));
            blocks["initial_norm"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, this->config.mid_channels));
            for (int i = 0; i < this->config.num_blocks_per_stage; ++i) {
                blocks["res_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new ResBlock(this->config.mid_channels, this->config.dims));
            }
            blocks["upsampler.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(this->config.mid_channels,
                                                                          4 * this->config.mid_channels,
                                                                          {3, 3},
                                                                          {1, 1},
                                                                          {1, 1}));
            blocks["upsampler.1"] = std::shared_ptr<GGMLBlock>(new PixelShuffleND(2));
            for (int i = 0; i < this->config.num_blocks_per_stage; ++i) {
                blocks["post_upsample_res_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new ResBlock(this->config.mid_channels, this->config.dims));
            }
            blocks["final_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(this->config.mid_channels,
                                                                         this->config.in_channels,
                                                                         {3, 3, 3},
                                                                         {1, 1, 1},
                                                                         {1, 1, 1}));
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
            // in : x [b*c, f, h, w]
            // out:   [b*c, f, h*2, w*2]
            auto gctx = ctx->ggml_ctx;

            // Applies every ResBlock stored under `block_prefix` + index, in index order,
            // and marks a graph cut named `cut_prefix` + index after each one.
            auto run_res_stage = [&](const std::string& block_prefix, const std::string& cut_prefix) {
                for (int idx = 0; idx < config.num_blocks_per_stage; ++idx) {
                    const std::string idx_str = std::to_string(idx);
                    auto res_block            = std::dynamic_pointer_cast<ResBlock>(blocks[block_prefix + idx_str]);
                    x                         = res_block->forward(ctx, x);
                    sd::ggml_graph_cut::mark_graph_cut(x, cut_prefix + idx_str, "x");
                }
            };

            // Stem: 3D conv -> group norm -> SiLU.
            x = std::dynamic_pointer_cast<Conv3d>(blocks["initial_conv"])->forward(ctx, x);
            x = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["initial_norm"])->forward(ctx, x);
            x = ggml_silu(gctx, x);
            sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.initial", "x");

            run_res_stage("res_blocks.", "ltx_latent_upsampler.res_blocks.");

            // Spatial x2 upsample.
            // rearrange(x, "b c f h w -> (b f) c h w") so the 2D conv and pixel shuffle see one frame per batch entry.
            auto upsample_conv = std::dynamic_pointer_cast<Conv2d>(blocks["upsampler.0"]);
            auto pixel_shuffle = std::dynamic_pointer_cast<PixelShuffleND>(blocks["upsampler.1"]);
            x                  = ggml_ext_cont(gctx, ggml_ext_torch_permute(gctx, x, 0, 1, 3, 2));  // [b*f, c, h, w]
            x                  = upsample_conv->forward(ctx, x);                                    // [b*f, c*4, h, w]
            x                  = pixel_shuffle->forward(ctx, x);                                    // [b*f, c, h*2, w*2]
            // Same axis swap again to go back to the channel-major layout.
            x = ggml_ext_cont(gctx, ggml_ext_torch_permute(gctx, x, 0, 1, 3, 2));  // [b*c, f, h*2, w*2]
            sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.spatial_up", "x");

            run_res_stage("post_upsample_res_blocks.", "ltx_latent_upsampler.post_blocks.");

            // Project back to the latent channel count.
            x = std::dynamic_pointer_cast<Conv3d>(blocks["final_conv"])->forward(ctx, x);
            sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.final", "x");
            return x;
        }
    };
    struct LatentUpsamplerRunner : public GGMLRunner {
        std::unique_ptr<LatentUpsampler> model;
        LatentUpsamplerRunner(ggml_backend_t backend,
                              ggml_backend_t params_backend)
            : GGMLRunner(backend, params_backend) {}
        std::string get_desc() override {
            return "ltx_latent_upsampler";
        }
        bool load_from_file(const std::string& file_path, int n_threads) {
            LOG_INFO("loading LTX latent upsampler from '%s'", file_path.c_str());
            ModelLoader model_loader;
            if (!model_loader.init_from_file(file_path)) {
                LOG_ERROR("init LTX latent upsampler model loader from file failed: '%s'", file_path.c_str());
                return false;
            }
            const auto& tensor_storage_map = model_loader.get_tensor_storage_map();
            if (!has_tensor(tensor_storage_map, "post_upsample_res_blocks.0.conv2.bias") ||
                !has_tensor(tensor_storage_map, "upsampler.0.weight")) {
                LOG_ERROR("unsupported LTX latent upsampler weights: expected spatial upsampler tensors");
                return false;
            }
            LatentUpsamplerConfig config = detect_config_from_weights(tensor_storage_map);
            if (config.dims != 3 || !config.spatial_upsample || config.temporal_upsample ||
                config.rational_resampler) {
                LOG_ERROR("unsupported LTX latent upsampler config: dims=%d spatial=%d temporal=%d rational=%d",
                          config.dims,
                          config.spatial_upsample,
                          config.temporal_upsample,
                          config.rational_resampler);
                return false;
            }
            model = std::make_unique<LatentUpsampler>(config);
            model->init(params_ctx, tensor_storage_map, "");
            if (!alloc_params_buffer()) {
                LOG_ERROR("LTX latent upsampler params buffer allocation failed");
                return false;
            }
            std::map<std::string, ggml_tensor*> tensors;
            model->get_param_tensors(tensors);
            if (!model_loader.load_tensors(tensors, {}, n_threads)) {
                LOG_ERROR("load LTX latent upsampler tensors failed");
                return false;
            }
            LOG_INFO("LTX latent upsampler loaded: in_channels=%" PRId64 ", mid_channels=%" PRId64 ", blocks=%d",
                     config.in_channels,
                     config.mid_channels,
                     config.num_blocks_per_stage);
            return true;
        }
        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
            if (!model) {
                return nullptr;
            }
            ggml_cgraph* gf  = new_graph_custom(LTX_UPSAMPLER_GRAPH_SIZE);
            ggml_tensor* x   = make_input(x_tensor);
            auto runner_ctx  = get_context();
            ggml_tensor* out = model->forward(&runner_ctx, x);
            ggml_build_forward_expand(gf, out);
            return gf;
        }
        sd::Tensor<float> compute(const int n_threads,
                                  const sd::Tensor<float>& x) {
            if (!model) {
                LOG_ERROR("LTX latent upsampler is not loaded");
                return {};
            }
            if (x.dim() != 4 && x.dim() != 5) {
                LOG_ERROR("LTX latent upsampler expects 4D or 5D video latent, got dim=%lld",
                          (long long)x.dim());
                return {};
            }
            if (x.dim() == 5 && x.shape()[4] != 1) {
                LOG_ERROR("LTX latent upsampler currently supports batch size 1, got batch=%lld",
                          (long long)x.shape()[4]);
                return {};
            }
            if (x.shape()[3] != model->config.in_channels) {
                LOG_ERROR("LTX latent upsampler expected %" PRId64 " channels, got %lld",
                          model->config.in_channels,
                          (long long)x.shape()[3]);
                return {};
            }
            size_t expected_dim = static_cast<size_t>(x.dim());
            auto get_graph      = [&]() -> ggml_cgraph* { return build_graph(x); };
            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), expected_dim);
        }
    };
 }  // namespace LTXVUpsampler
 #endif  // __SD_LTX_LATENT_UPSCALER_HPP__
--- a/src/ltx_vae.hpp
+++ b/src/ltx_vae.hpp
@ -1123,6 +1123,18 @@ namespace LTXVAE {
            mean      = ggml_cont(ctx->ggml_ctx, mean);
            return processor->normalize(ctx, mean);
        }
        // Applies the "per_channel_statistics" sub-block's normalize() to x.
        ggml_tensor* normalize_latents(GGMLRunnerContext* ctx,
                                       ggml_tensor* x) {
            auto& stats_block = blocks["per_channel_statistics"];
            return std::dynamic_pointer_cast<PerChannelStatistics>(stats_block)->normalize(ctx, x);
        }
        // Applies the "per_channel_statistics" sub-block's un_normalize() to x.
        ggml_tensor* un_normalize_latents(GGMLRunnerContext* ctx,
                                          ggml_tensor* x) {
            auto& stats_block = blocks["per_channel_statistics"];
            return std::dynamic_pointer_cast<PerChannelStatistics>(stats_block)->un_normalize(ctx, x);
        }
    };
 }  // namespace LTXVAE
@ -1192,6 +1204,17 @@ struct LTXVideoVAE : public VAE {
        return gf;
    }
    // Builds a graph that applies the VAE's latent statistics to z_tensor:
    // normalize_latents when `normalize` is true, un_normalize_latents otherwise.
    ggml_cgraph* build_latent_statistics_graph(const sd::Tensor<float>& z_tensor, bool normalize) {
        ggml_cgraph* graph  = new_graph_custom(1024);
        ggml_tensor* latent = make_input(z_tensor);
        auto runner_ctx     = get_context();

        ggml_tensor* result = nullptr;
        if (normalize) {
            result = vae.normalize_latents(&runner_ctx, latent);
        } else {
            result = vae.un_normalize_latents(&runner_ctx, latent);
        }

        ggml_build_forward_expand(graph, result);
        return graph;
    }
    sd::Tensor<float> _compute(const int n_threads,
                               const sd::Tensor<float>& z,
                               bool decode_graph) override {
@ -1226,6 +1249,26 @@ struct LTXVideoVAE : public VAE {
        return result;
    }
    // Runs the latent-statistics graph on z (normalize or un-normalize, per `normalize`)
    // and restores the result to the same number of dims as the input.
    sd::Tensor<float> apply_latent_statistics(const int n_threads,
                                              const sd::Tensor<float>& z,
                                              bool normalize) {
        const auto input_rank = static_cast<size_t>(z.dim());
        auto graph_builder    = [this, &z, normalize]() -> ggml_cgraph* {
            return build_latent_statistics_graph(z, normalize);
        };
        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(graph_builder, n_threads, false),
                                               input_rank);
    }
    // Convenience wrapper: apply_latent_statistics() in normalize mode.
    sd::Tensor<float> normalize_latents(const int n_threads,
                                        const sd::Tensor<float>& z) {
        const bool normalize = true;
        return apply_latent_statistics(n_threads, z, normalize);
    }
    // Convenience wrapper: apply_latent_statistics() in un-normalize mode.
    sd::Tensor<float> un_normalize_latents(const int n_threads,
                                           const sd::Tensor<float>& z) {
        const bool normalize = false;
        return apply_latent_statistics(n_threads, z, normalize);
    }
    int get_encoder_output_channels(int input_channels) override {
        SD_UNUSED(input_channels);
        return 256;
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -17,6 +17,7 @@
 #include "guidance.h"
 #include "lora.hpp"
 #include "ltx_audio_vae.h"
 #include "ltx_latent_upscaler.hpp"
 #include "ltx_vae.hpp"
 #include "pmid.hpp"
 #include "sample-cache.h"
@ -2198,6 +2199,24 @@ public:
        return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
    }
    // Normalizes x through the first-stage model when it is an LTXVideoVAE.
    // Logs an error and returns an empty tensor for any other first-stage model.
    sd::Tensor<float> normalize_ltx_video_latents(const sd::Tensor<float>& x) {
        if (auto video_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model)) {
            return video_vae->normalize_latents(n_threads, x);
        }
        LOG_ERROR("LTX latent normalization requires LTX video VAE");
        return {};
    }
    // Un-normalizes x through the first-stage model when it is an LTXVideoVAE.
    // Logs an error and returns an empty tensor for any other first-stage model.
    sd::Tensor<float> un_normalize_ltx_video_latents(const sd::Tensor<float>& x) {
        if (auto video_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model)) {
            return video_vae->un_normalize_latents(n_threads, x);
        }
        LOG_ERROR("LTX latent un-normalization requires LTX video VAE");
        return {};
    }
    sd::Tensor<float> decode_ltx_audio_latent(const sd::Tensor<float>& audio_latent) {
        if (audio_vae_model == nullptr || audio_latent.empty()) {
            return {};
@ -2464,16 +2483,18 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
 }
 // Resets *hires_params to its defaults: hires disabled, latent upscaler, no model path,
 // scale 2.0, no explicit target size, steps 0, denoising strength 0.7, upscale tile size 128,
 // and no custom sigmas. hires_params is dereferenced unconditionally, so it must be non-null.
 void sd_hires_params_init(sd_hires_params_t* hires_params) {
-    *hires_params                    = {};
+    *hires_params                     = {};
-    hires_params->enabled            = false;
+    hires_params->enabled             = false;
-    hires_params->upscaler           = SD_HIRES_UPSCALER_LATENT;
+    hires_params->upscaler            = SD_HIRES_UPSCALER_LATENT;
-    hires_params->model_path         = nullptr;
+    hires_params->model_path          = nullptr;
-    hires_params->scale              = 2.0f;
+    hires_params->scale               = 2.0f;
-    hires_params->target_width       = 0;
+    hires_params->target_width        = 0;
-    hires_params->target_height      = 0;
+    hires_params->target_height       = 0;
-    hires_params->steps              = 0;
+    hires_params->steps               = 0;
-    hires_params->denoising_strength = 0.7f;
+    hires_params->denoising_strength  = 0.7f;
-    hires_params->upscale_tile_size  = 128;
+    hires_params->upscale_tile_size   = 128;
    hires_params->custom_sigmas       = nullptr;
    hires_params->custom_sigmas_count = 0;
 }
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
@ -2746,6 +2767,16 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
    sd_vid_gen_params->moe_boundary                          = 0.875f;
    sd_vid_gen_params->vace_strength                         = 1.f;
    sd_vid_gen_params->vae_tiling_params                     = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};
    sd_vid_gen_params->hires.enabled                         = false;
    sd_vid_gen_params->hires.upscaler                        = SD_HIRES_UPSCALER_LATENT;
    sd_vid_gen_params->hires.scale                           = 2.f;
    sd_vid_gen_params->hires.target_width                    = 0;
    sd_vid_gen_params->hires.target_height                   = 0;
    sd_vid_gen_params->hires.steps                           = 0;
    sd_vid_gen_params->hires.denoising_strength              = 0.7f;
    sd_vid_gen_params->hires.upscale_tile_size               = 128;
    sd_vid_gen_params->hires.custom_sigmas                   = nullptr;
    sd_vid_gen_params->hires.custom_sigmas_count             = 0;
    sd_cache_params_init(&sd_vid_gen_params->cache);
 }
@ -2995,6 +3026,7 @@ struct GenerationRequest {
        vace_strength               = sd_vid_gen_params->vace_strength;
        guidance                    = sd_vid_gen_params->sample_params.guidance;
        high_noise_guidance         = sd_vid_gen_params->high_noise_sample_params.guidance;
        hires                       = sd_vid_gen_params->hires;
        resolve(sd_ctx);
        if (frames != requested_frames) {
            LOG_WARN("align video frames from %d to %d for %s",
@ -3053,6 +3085,20 @@ struct GenerationRequest {
            hires.enabled = false;
            return;
        }
        if (hires.custom_sigmas_count < 0) {
            LOG_WARN("hires custom sigmas count is negative, ignoring custom sigmas");
            hires.custom_sigmas       = nullptr;
            hires.custom_sigmas_count = 0;
        }
        if (hires.custom_sigmas_count > 0 && hires.custom_sigmas == nullptr) {
            LOG_WARN("hires custom sigmas count is positive but custom sigmas are null, ignoring custom sigmas");
            hires.custom_sigmas_count = 0;
        }
        if (hires.custom_sigmas_count == 1) {
            LOG_WARN("hires custom sigmas requires at least two values, ignoring custom sigmas");
            hires.custom_sigmas       = nullptr;
            hires.custom_sigmas_count = 0;
        }
        hires.denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f);
        hires.steps              = std::max(0, hires.steps);
@ -3417,6 +3463,85 @@ static sd::Tensor<float> pack_ltxav_audio_and_video_denoise_mask(const sd::Tenso
    return sd::ops::concat(video_mask_full, audio_mask, 3);
 }
 // Builds a denoise mask filled with `value` whose first three dims match video_latent
 // and whose last two dims are 1. Returns an empty tensor when video_latent is empty.
 static sd::Tensor<float> make_ltxav_video_denoise_mask(const sd::Tensor<float>& video_latent, float value = 1.f) {
    if (!video_latent.empty()) {
        const auto& dims = video_latent.shape();
        return sd::full<float>({dims[0], dims[1], dims[2], 1, 1}, value);
    }
    return {};
 }
 // Encodes a single conditioning image with the first-stage model.
 // The image is reshaped with an extra size-1 dim inserted at index 2 before encoding.
 // `name` is only used in the error message. Returns an empty tensor when the context
 // or image is missing, or when encoding yields nothing.
 static sd::Tensor<float> encode_ltxav_condition_image(sd_ctx_t* sd_ctx,
                                                       const sd::Tensor<float>& image,
                                                       const char* name) {
    const bool ctx_ready = sd_ctx != nullptr && sd_ctx->sd != nullptr;
    if (!ctx_ready || image.empty()) {
        return {};
    }

    const auto& dims  = image.shape();
    auto single_frame = image.reshape({dims[0], dims[1], 1, dims[2], dims[3]});
    auto latent       = sd_ctx->sd->encode_first_stage(single_frame);
    if (latent.empty()) {
        LOG_ERROR("failed to encode LTXAV %s image", name);
    }
    return latent;
 }
 // Writes condition_latent into *video_latent along dim 2, starting at latent_idx, and sets the
 // matching slice of *video_mask to conditioned_mask.
 // Returns false without touching the tensors when either target is null/empty, when dims 0, 1 or 3
 // of the condition latent differ from the video latent, or when the frame range does not fit.
 // `name` is only used in error messages.
 static bool apply_ltxav_condition_by_latent_index(sd::Tensor<float>* video_latent,
                                                   sd::Tensor<float>* video_mask,
                                                   const sd::Tensor<float>& condition_latent,
                                                   int64_t latent_idx,
                                                   const char* name,
                                                   float conditioned_mask) {
    const bool targets_usable = video_latent != nullptr && video_mask != nullptr &&
                                !video_latent->empty() && !video_mask->empty();
    if (!targets_usable) {
        return false;
    }

    // Every dim except the frame dim (2) must agree between condition and target.
    auto shapes_compatible = [&]() -> bool {
        const size_t checked_axes[] = {0, 1, 3};
        for (size_t axis : checked_axes) {
            if (condition_latent.shape()[axis] != video_latent->shape()[axis]) {
                return false;
            }
        }
        return true;
    };
    if (condition_latent.empty() || !shapes_compatible()) {
        LOG_ERROR("invalid LTXAV %s condition latent shape", name);
        return false;
    }

    const int64_t total_frames = video_latent->shape()[2];
    const int64_t cond_frames  = condition_latent.shape()[2];
    const bool range_ok        = latent_idx >= 0 && cond_frames > 0 && latent_idx + cond_frames <= total_frames;
    if (!range_ok) {
        LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
                  name,
                  latent_idx,
                  cond_frames,
                  total_frames);
        return false;
    }

    const int64_t range_end = latent_idx + cond_frames;
    sd::ops::slice_assign(video_latent, 2, latent_idx, range_end, condition_latent);
    sd::ops::fill_slice(video_mask, 2, latent_idx, range_end, conditioned_mask);
    return true;
 }
 // Encodes `image` and applies the resulting latent at latent_idx.
 // The mask value written for the conditioned frames is 1 - clamp(strength, 0, 1).
 // Returns false when encoding produced nothing or the latent could not be applied.
 static bool apply_ltxav_condition_image_by_latent_index(sd_ctx_t* sd_ctx,
                                                         const sd::Tensor<float>& image,
                                                         sd::Tensor<float>* video_latent,
                                                         sd::Tensor<float>* video_mask,
                                                         int64_t latent_idx,
                                                         const char* name,
                                                         float strength) {
    auto encoded = encode_ltxav_condition_image(sd_ctx, image, name);
    if (encoded.empty()) {
        return false;
    }
    const float mask_value = 1.0f - std::clamp(strength, 0.f, 1.f);
    return apply_ltxav_condition_by_latent_index(video_latent,
                                                 video_mask,
                                                 encoded,
                                                 latent_idx,
                                                 name,
                                                 mask_value);
 }
 static sd::Tensor<float> unpack_ltxav_audio_latent(const sd::Tensor<float>& packed_latent,
                                                   int audio_length,
                                                   int video_channels) {
@ -3978,6 +4103,53 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
    return {};
 }
 // Produces the sigma schedule for the hires pass.
 // - With custom sigmas (non-null pointer, positive count) they are returned verbatim and
 //   *scheduler_steps_out receives their count minus one.
 // - Otherwise a schedule with floor(steps / denoising_strength) total steps is generated
 //   (steps = hires.steps if positive, else default_steps, at least 1) and only its tail is
 //   returned; *scheduler_steps_out receives the total step count.
 // scheduler_steps_out may be null; when non-null it is first set to 0.
 static std::vector<float> make_hires_sigma_schedule(sd_ctx_t* sd_ctx,
                                                     const sd_hires_params_t& hires,
                                                     const sd_sample_params_t& sample_params,
                                                     sample_method_t sample_method,
                                                     int default_steps,
                                                     int sample_seq_len,
                                                     int* scheduler_steps_out) {
    auto report_steps = [scheduler_steps_out](int steps) {
        if (scheduler_steps_out != nullptr) {
            *scheduler_steps_out = steps;
        }
    };
    report_steps(0);

    const bool use_custom = hires.custom_sigmas_count > 0 && hires.custom_sigmas != nullptr;
    if (use_custom) {
        const float* first = hires.custom_sigmas;
        std::vector<float> user_sigmas(first, first + hires.custom_sigmas_count);
        report_steps(static_cast<int>(user_sigmas.size()) - 1);
        return user_sigmas;
    }

    const int requested_steps = std::max(1, hires.steps > 0 ? hires.steps : default_steps);

    // sd-webui behavior: the total step count is scaled up so that trimming by denoising_strength
    // leaves exactly the requested number of effective steps (img2img instead trims a fixed count).
    const int total_steps = std::max(1, static_cast<int>(requested_steps / hires.denoising_strength));

    const scheduler_t resolved_scheduler = resolve_scheduler(sd_ctx,
                                                             sample_params.scheduler,
                                                             sample_method);
    const std::vector<float> full_schedule = sd_ctx->sd->denoiser->get_sigmas(total_steps,
                                                                              sample_seq_len,
                                                                              resolved_scheduler,
                                                                              sd_ctx->sd->version,
                                                                              sample_params.extra_sample_args);

    // Number of trailing steps to keep, capped at total_steps - 1.
    size_t kept_steps = static_cast<size_t>(total_steps * hires.denoising_strength);
    kept_steps        = std::min(kept_steps, static_cast<size_t>(total_steps) - 1);

    report_steps(total_steps);

    const int first_kept = total_steps - static_cast<int>(kept_steps) - 1;
    return std::vector<float>(full_schedule.begin() + first_kept, full_schedule.end());
 }
 SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
    if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
        return nullptr;
@ -4100,29 +4272,20 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
            }
        }
-        int hires_steps = request.hires.steps > 0 ? request.hires.steps : plan.sample_steps;
+        int hires_scheduler_steps = 0;
-
+        std::vector<float> hires_sigma_sched =
-        // sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps,
+            make_hires_sigma_schedule(sd_ctx,
-        // unlike img2img which trims from a fixed step count
+                                      request.hires,
-        hires_steps = static_cast<int>(hires_steps / request.hires.denoising_strength);
+                                      sd_img_gen_params->sample_params,
-
+                                      plan.sample_method,
-        std::vector<float> hires_sigmas = sd_ctx->sd->denoiser->get_sigmas(
+                                      plan.sample_steps,
-            hires_steps,
+                                      sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
-            sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
+                                      &hires_scheduler_steps);
-            sd_img_gen_params->sample_params.scheduler,
+        LOG_INFO("hires fix: scheduler_steps=%d, denoising_strength=%.2f, sigma_sched_size=%zu%s",
-            sd_ctx->sd->version,
+                 hires_scheduler_steps,
            sd_img_gen_params->sample_params.extra_sample_args);
        size_t t_enc = static_cast<size_t>(hires_steps * request.hires.denoising_strength);
        if (t_enc >= static_cast<size_t>(hires_steps)) {
            t_enc = static_cast<size_t>(hires_steps) - 1;
        }
        std::vector<float> hires_sigma_sched(hires_sigmas.begin() + hires_steps - static_cast<int>(t_enc) - 1,
                                             hires_sigmas.end());
        LOG_INFO("hires fix: %d steps, denoising_strength=%.2f, sigma_sched_size=%zu",
                 hires_steps,
                 request.hires.denoising_strength,
-                 hires_sigma_sched.size());
+                 hires_sigma_sched.size(),
                 request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");
        std::vector<sd::Tensor<float>> hires_final_latents;
        int64_t hires_denoise_start = ggml_time_ms();
@ -4270,44 +4433,7 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
            float conditioning_strength = std::clamp(request->strength, 0.f, 1.f);
            float conditioned_mask      = 1.0f - conditioning_strength;
-            latents.denoise_mask        = sd::full<float>({latents.init_latent.shape()[0],
+            latents.denoise_mask        = make_ltxav_video_denoise_mask(latents.init_latent, 1.f);
                                                           latents.init_latent.shape()[1],
                                                           latents.init_latent.shape()[2],
                                                           1,
                                                           1},
                                                   1.f);
            auto encode_ltxav_condition_image = [&](const sd::Tensor<float>& image, const char* name) -> sd::Tensor<float> {
                auto condition_image  = image.reshape({image.shape()[0],
                                                       image.shape()[1],
                                                       1,
                                                       image.shape()[2],
                                                       image.shape()[3]});
                auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image);
                if (condition_latent.empty()) {
                    LOG_ERROR("failed to encode LTXAV %s image", name);
                }
                return condition_latent;
            };
            auto apply_video_condition_by_latent_index = [&](const sd::Tensor<float>& condition_latent,
                                                             int64_t latent_idx,
                                                             const char* name) -> bool {
                int64_t latent_frames    = latents.init_latent.shape()[2];
                int64_t condition_frames = condition_latent.shape()[2];
                if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) {
                    LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
                              name,
                              latent_idx,
                              condition_frames,
                              latent_frames);
                    return false;
                }
                sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent);
                sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask);
                return true;
            };
            auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor<float>& keyframes,
                                                               int frame_idx,
@ -4345,20 +4471,30 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
            };
            if (!start_image.empty()) {
-                auto start_image_latent = encode_ltxav_condition_image(start_image, "init");
+                if (!apply_ltxav_condition_image_by_latent_index(sd_ctx,
-                if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) {
+                                                                 start_image,
                                                                 &latents.init_latent,
                                                                 &latents.denoise_mask,
                                                                 0,
                                                                 "init",
                                                                 conditioning_strength)) {
                    return std::nullopt;
                }
            }
            if (!end_image.empty()) {
-                auto end_image_latent = encode_ltxav_condition_image(end_image, "end");
+                auto end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end");
                if (end_image_latent.empty()) {
                    return std::nullopt;
                }
                int frame_idx = request->frames - 1;
-                bool ok       = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end")
+                bool ok       = frame_idx == 0 ? apply_ltxav_condition_by_latent_index(&latents.init_latent,
                                                                                       &latents.denoise_mask,
                                                                                       end_image_latent,
                                                                                       0,
                                                                                       "end",
                                                                                       conditioned_mask)
                                               : apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end");
                if (!ok) {
                    return std::nullopt;
@ -4639,6 +4775,175 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
    return result_images;
 }
 // Runs the model-backed LTX spatial latent upsampler (x2) on a sampled latent.
 //
 // sd_ctx        - generation context; must be non-null with a valid `sd` member.
 // model_path    - path of the latent upsampler weights; must be non-empty.
 // packed_latent - sampled latent. When dimension 3 is larger than the model's
 //                 latent channel count, the extra channels are treated as packed
 //                 audio: they are split off before upscaling and re-packed after.
 // audio_length  - forwarded to unpack_ltxav_audio_latent when audio is packed.
 //
 // The video latent is un-normalized before the upsampler runs and normalized
 // again afterwards. Returns the upscaled (and, if applicable, re-packed) latent,
 // or an empty tensor on any failure.
 static sd::Tensor<float> upscale_ltx_spatial_video_latent(sd_ctx_t* sd_ctx,
                                                           const char* model_path,
                                                           const sd::Tensor<float>& packed_latent,
                                                           int audio_length) {
    if (sd_ctx == nullptr || sd_ctx->sd == nullptr || packed_latent.empty()) {
        return {};
    }
    if (strlen(SAFE_STR(model_path)) == 0) {
        LOG_ERROR("LTX latent spatial upscale requires a model path");
        return {};
    }
    if (!sd_ctx->sd->ensure_backend_pair(SDBackendModule::UPSCALER)) {
        return {};
    }

    // Split the packed latent into its video part and (optional) audio part.
    int latent_channels            = sd_ctx->sd->get_latent_channel();
    sd::Tensor<float> video_latent = packed_latent;
    sd::Tensor<float> audio_latent;
    if (packed_latent.shape()[3] > latent_channels) {
        video_latent = sd::ops::slice(packed_latent, 3, 0, latent_channels);
        audio_latent = unpack_ltxav_audio_latent(packed_latent, audio_length, latent_channels);
        if (audio_latent.empty()) {
            // Without this check the audio would be silently dropped and a
            // video-only latent returned for a request that carries audio.
            LOG_ERROR("failed to unpack LTXAV audio latent before spatial upscale");
            return {};
        }
    }

    LOG_INFO("LTX latent spatial upscale: latent %dx%dx%dx%d -> x2",
             (int)video_latent.shape()[0],
             (int)video_latent.shape()[1],
             (int)video_latent.shape()[2],
             (int)video_latent.shape()[3]);

    sd::Tensor<float> unnormalized = sd_ctx->sd->un_normalize_ltx_video_latents(video_latent);
    if (unnormalized.empty()) {
        LOG_ERROR("LTX latent un-normalization failed before spatial upscale");
        return {};
    }

    auto upsampler =
        std::make_unique<LTXVUpsampler::LatentUpsamplerRunner>(sd_ctx->sd->backend_for(SDBackendModule::UPSCALER),
                                                               sd_ctx->sd->params_backend_for(SDBackendModule::UPSCALER));
    const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
    upsampler->set_max_graph_vram_bytes(max_graph_vram_bytes);
    if (!upsampler->load_from_file(model_path, sd_ctx->sd->n_threads)) {
        LOG_ERROR("load LTX latent upsampler failed");
        return {};
    }

    sd::Tensor<float> upscaled = upsampler->compute(sd_ctx->sd->n_threads, unnormalized);
    // Destroy the upsampler as soon as its output is available, before the
    // remaining post-processing runs.
    upsampler.reset();
    if (upscaled.empty()) {
        LOG_ERROR("LTX latent spatial upscale failed");
        return {};
    }

    upscaled = sd_ctx->sd->normalize_ltx_video_latents(upscaled);
    if (upscaled.empty()) {
        LOG_ERROR("LTX latent normalization failed after spatial upscale");
        return {};
    }

    // Re-attach the untouched audio latent so the result has the same packed
    // layout as the input.
    if (!audio_latent.empty()) {
        upscaled = pack_ltxav_audio_and_video_latents(upscaled, audio_latent);
    }
    return upscaled;
 }
 // Applies init/end image conditioning to a latent for the LTXV refine pass.
 //
 // Returns true without touching the outputs when any pointer argument is null,
 // `*latent` is empty, or neither an init nor an end image was supplied. Returns
 // false when conditioning is required but cannot be applied (VAE is decode-only,
 // audio unpacking fails, image encoding fails, keyframe metadata is missing, or
 // a conditioning call fails).
 //
 // On success `*latent` and `*denoise_mask` are overwritten (re-packed with the
 // audio latent when `*latent` carried audio channels). `*video_positions` is
 // only written when an end image is applied and `request.frames` is more than 1.
 static bool apply_ltxv_refine_image_conditioning(sd_ctx_t* sd_ctx,
                                                  const sd_vid_gen_params_t* sd_vid_gen_params,
                                                  const GenerationRequest& request,
                                                  const ImageGenerationLatents& latents,
                                                  sd::Tensor<float>* latent,
                                                  sd::Tensor<float>* denoise_mask,
                                                  sd::Tensor<float>* video_positions) {
    const bool args_usable = sd_ctx != nullptr && sd_ctx->sd != nullptr && sd_vid_gen_params != nullptr &&
                             latent != nullptr && !latent->empty() && denoise_mask != nullptr &&
                             video_positions != nullptr;
    if (!args_usable) {
        return true;
    }

    const bool has_init_image = sd_vid_gen_params->init_image.data != nullptr;
    const bool has_end_image  = sd_vid_gen_params->end_image.data != nullptr;
    if (!has_init_image && !has_end_image) {
        // Nothing to condition on.
        return true;
    }

    if (sd_ctx->sd->vae_decode_only) {
        LOG_ERROR("LTXV refine image conditioning requires VAE encoder weights; create the context with vae_decode_only=false");
        return false;
    }

    constexpr float conditioning_strength = 1.f;
    constexpr float end_mask_value        = 1.f - conditioning_strength;

    // Separate the video channels from any packed audio channels.
    int video_channels              = sd_ctx->sd->get_latent_channel();
    sd::Tensor<float> refined_video = *latent;
    sd::Tensor<float> packed_audio;
    if (latent->shape()[3] > video_channels) {
        refined_video = sd::ops::slice(*latent, 3, 0, video_channels);
        packed_audio  = unpack_ltxav_audio_latent(*latent, latents.audio_length, video_channels);
        if (packed_audio.empty()) {
            LOG_ERROR("failed to unpack LTXAV audio latent before image-to-video inplace conditioning");
            return false;
        }
    }

    // Pixel size that the conditioning images are converted at, derived from
    // the video latent's first two dimensions.
    int pixel_width                = static_cast<int>(refined_video.shape()[0]) * request.vae_scale_factor;
    int pixel_height               = static_cast<int>(refined_video.shape()[1]) * request.vae_scale_factor;
    sd::Tensor<float> refined_mask = make_ltxav_video_denoise_mask(refined_video, 1.f);

    if (has_init_image) {
        sd::Tensor<float> init_pixels = sd_image_to_tensor(sd_vid_gen_params->init_image, pixel_width, pixel_height);
        const bool init_applied       = apply_ltxav_condition_image_by_latent_index(sd_ctx,
                                                                                    init_pixels,
                                                                                    &refined_video,
                                                                                    &refined_mask,
                                                                                    0,
                                                                                    "init",
                                                                                    conditioning_strength);
        if (!init_applied) {
            return false;
        }
    }

    if (has_end_image) {
        sd::Tensor<float> end_pixels = sd_image_to_tensor(sd_vid_gen_params->end_image, pixel_width, pixel_height);
        sd::Tensor<float> end_latent = encode_ltxav_condition_image(sd_ctx, end_pixels, "end");
        if (end_latent.empty()) {
            return false;
        }

        int last_frame_idx      = request.frames - 1;
        const bool single_frame = last_frame_idx == 0;
        if (!single_frame &&
            (latents.video_conditioning_frame_count <= 0 || latents.video_target_frame_count <= 0)) {
            LOG_ERROR("LTXV FLF2V refine conditioning requires low-resolution keyframe conditioning metadata");
            return false;
        }

        // A single-frame request conditions latent index 0; otherwise the end
        // latent goes at the index given by the target frame count.
        int64_t end_latent_index = single_frame ? 0 : latents.video_target_frame_count;
        if (!apply_ltxav_condition_by_latent_index(&refined_video,
                                                   &refined_mask,
                                                   end_latent,
                                                   end_latent_index,
                                                   "end",
                                                   end_mask_value)) {
            return false;
        }

        if (!single_frame) {
            *video_positions = build_ltxv_video_positions(refined_video.shape()[0],
                                                          refined_video.shape()[1],
                                                          end_latent_index,
                                                          end_latent.shape()[2],
                                                          last_frame_idx,
                                                          1,
                                                          request.fps,
                                                          request.vae_scale_factor,
                                                          8,
                                                          true);
        }
    }

    if (packed_audio.empty()) {
        *latent       = std::move(refined_video);
        *denoise_mask = std::move(refined_mask);
    } else {
        *latent       = pack_ltxav_audio_and_video_latents(refined_video, packed_audio);
        *denoise_mask = pack_ltxav_audio_and_video_denoise_mask(refined_mask, refined_video, packed_audio);
    }

    LOG_INFO("LTXV refine image conditioning applied at %dx%d", pixel_width, pixel_height);
    return true;
 }
 SD_API bool generate_video(sd_ctx_t* sd_ctx,
                           const sd_vid_gen_params_t* sd_vid_gen_params,
                           sd_image_t** frames_out,
@ -4659,6 +4964,23 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
    int64_t t0                    = ggml_time_ms();
    sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params;
    GenerationRequest request(sd_ctx, sd_vid_gen_params);
    bool latent_upscale_enabled     = request.hires.enabled;
    GenerationRequest hires_request = request;
    if (latent_upscale_enabled) {
        if (!sd_version_is_ltxav(sd_ctx->sd->version)) {
            LOG_ERROR("LTX latent spatial upscale is only supported for LTX video models");
            return false;
        }
        if (request.hires.upscaler != SD_HIRES_UPSCALER_MODEL) {
            LOG_ERROR("LTX latent spatial upscale currently requires hires upscaler MODEL");
            return false;
        }
        if (strlen(SAFE_STR(request.hires.model_path)) == 0) {
            LOG_ERROR("LTX latent spatial upscale is enabled but hires model path was not provided");
            return false;
        }
    }
    sd_ctx->sd->rng->manual_seed(request.seed);
    sd_ctx->sd->sampler_rng->manual_seed(request.seed);
    sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
@ -4670,14 +4992,22 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
        return false;
    }
    ImageGenerationLatents latents = std::move(*latent_inputs_opt);
-    ImageGenerationEmbeds embeds   = prepare_video_generation_embeds(sd_ctx,
+
-                                                                     sd_vid_gen_params,
+    ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx,
-                                                                     request,
+                                                                   sd_vid_gen_params,
-                                                                     latents);
+                                                                   request,
-    LOG_INFO("generate_video %dx%dx%d",
+                                                                   latents);
-             request.width,
+    if (latent_upscale_enabled) {
-             request.height,
+        LOG_INFO("generate_video %dx%dx%d -> LTX latent spatial upscale",
-             request.frames);
+                 request.width,
                 request.height,
                 request.frames);
    } else {
        LOG_INFO("generate_video %dx%dx%d",
                 request.width,
                 request.height,
                 request.frames);
    }
    int64_t latent_start = ggml_time_ms();
    int W                = request.width / request.vae_scale_factor;
@ -4769,15 +5099,126 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
                                                        latents.video_positions);
    int64_t sampling_end = ggml_time_ms();
    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->diffusion_model->free_params_buffer();
    }
    if (final_latent.empty()) {
        if (sd_ctx->sd->free_params_immediately) {
            sd_ctx->sd->diffusion_model->free_params_buffer();
        }
        LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
        return false;
    }
    LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
    if (latent_upscale_enabled) {
        int64_t upscale_start             = ggml_time_ms();
        sd::Tensor<float> upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx,
                                                                             request.hires.model_path,
                                                                             final_latent,
                                                                             latents.audio_length);
        int64_t upscale_end               = ggml_time_ms();
        if (upscaled_latent.empty()) {
            if (sd_ctx->sd->free_params_immediately) {
                sd_ctx->sd->diffusion_model->free_params_buffer();
            }
            return false;
        }
        LOG_INFO("LTX latent spatial upscale completed, taking %.2fs",
                 (upscale_end - upscale_start) * 1.0f / 1000);
        x_t                  = std::move(upscaled_latent);
        hires_request.width  = static_cast<int>(x_t.shape()[0]) * hires_request.vae_scale_factor;
        hires_request.height = static_cast<int>(x_t.shape()[1]) * hires_request.vae_scale_factor;
        if ((request.hires.target_width > 0 || request.hires.target_height > 0) &&
            (request.hires.target_width != hires_request.width || request.hires.target_height != hires_request.height)) {
            LOG_WARN("LTX latent spatial upsampler output is %dx%d; ignoring hires target %dx%d",
                     hires_request.width,
                     hires_request.height,
                     request.hires.target_width,
                     request.hires.target_height);
        }
        sd::Tensor<float> hires_denoise_mask;
        sd::Tensor<float> hires_video_positions;
        if (!apply_ltxv_refine_image_conditioning(sd_ctx,
                                                  sd_vid_gen_params,
                                                  hires_request,
                                                  latents,
                                                  &x_t,
                                                  &hires_denoise_mask,
                                                  &hires_video_positions)) {
            if (sd_ctx->sd->free_params_immediately) {
                sd_ctx->sd->diffusion_model->free_params_buffer();
            }
            return false;
        }
        noise = sd::Tensor<float>::randn_like(x_t, sd_ctx->sd->rng);
        W                                   = hires_request.width / hires_request.vae_scale_factor;
        H                                   = hires_request.height / hires_request.vae_scale_factor;
        T                                   = static_cast<int>(x_t.shape()[2]);
        sample_method_t hires_sample_method = plan.sample_method;
        int hires_scheduler_steps           = 0;
        std::vector<float> hires_sigma_sched =
            make_hires_sigma_schedule(sd_ctx,
                                      request.hires,
                                      sd_vid_gen_params->sample_params,
                                      hires_sample_method,
                                      plan.sample_steps,
                                      sd_ctx->sd->get_image_seq_len(hires_request.height, hires_request.width) * T,
                                      &hires_scheduler_steps);
        float hires_eta = resolve_eta(sd_ctx,
                                      sd_vid_gen_params->sample_params.eta,
                                      hires_sample_method);
        LOG_DEBUG("sample(latent upscale) %dx%dx%d", W, H, T);
        LOG_INFO("LTX latent spatial upscale refine: scheduler_steps=%d, denoising_strength=%.2f, sampler=%s, sigma_sched_size=%zu%s",
                 hires_scheduler_steps,
                 request.hires.denoising_strength,
                 sampling_methods_str[hires_sample_method],
                 hires_sigma_sched.size(),
                 request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");
        sampling_start = ggml_time_ms();
        final_latent   = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,
                                            true,
                                            x_t,
                                            std::move(noise),
                                            embeds.cond,
                                          hires_request.use_uncond ? embeds.uncond : SDCondition(),
                                            embeds.img_cond,
                                            embeds.id_cond,
                                            sd::Tensor<float>(),
                                            0.f,
                                            sd_vid_gen_params->sample_params.guidance,
                                            hires_eta,
                                            sd_vid_gen_params->sample_params.shifted_timestep,
                                            hires_sample_method,
                                            sd_ctx->sd->is_flow_denoiser(),
                                            plan.extra_sample_args,
                                            hires_sigma_sched,
                                            -1,
                                            std::vector<sd::Tensor<float>>{},
                                            false,
                                            hires_denoise_mask,
                                            sd::Tensor<float>(),
                                            hires_request.vace_strength,
                                            latents.audio_length,
                                            static_cast<float>(hires_request.fps),
                                            hires_request.cache_params,
                                            hires_video_positions);
        sampling_end   = ggml_time_ms();
        if (sd_ctx->sd->free_params_immediately) {
            sd_ctx->sd->diffusion_model->free_params_buffer();
        }
        if (final_latent.empty()) {
            LOG_ERROR("sampling(latent upscale) failed after %.2fs",
                      (sampling_end - sampling_start) * 1.0f / 1000);
            return false;
        }
        LOG_INFO("sampling(latent upscale) completed, taking %.2fs",
                 (sampling_end - sampling_start) * 1.0f / 1000);
    } else if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->diffusion_model->free_params_buffer();
    }
    sd_audio_t* generated_audio = nullptr;
    if (sd_version_is_ltxav(sd_ctx->sd->version) &&
        latents.audio_length > 0 &&
@ -4808,7 +5249,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
    int64_t latent_end = ggml_time_ms();
    LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000);
-    auto result = decode_video_outputs(sd_ctx, request, final_latent, num_frames_out);
+    auto result = decode_video_outputs(sd_ctx, latent_upscale_enabled ? hires_request : request, final_latent, num_frames_out);
    if (result == nullptr) {
        free_sd_audio(generated_audio);
        return false;