feat: add LTX spatial latent upscale hires support (#1533)

This commit is contained in:
leejet 2026-05-20 22:27:09 +08:00 committed by GitHub
parent bdd937f29a
commit b3374e6a71
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 1073 additions and 137 deletions

BIN
assets/ltx2/hires_i2v.webm Normal file

Binary file not shown.

View File

@ -13,6 +13,8 @@
- safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae - safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
- Download audio vae - Download audio vae
- safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae - safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
- Download LTX spatial latent upscaler
- safetensors: https://huggingface.co/Lightricks/LTX-2.3/resolve/main/ltx-2.3-spatial-upscaler-x2-1.1.safetensors
## Examples ## Examples
@ -50,4 +52,26 @@
src="../assets/ltx2/flf2v.webm" src="../assets/ltx2/flf2v.webm"
controls controls
muted muted
style="max-width: 100%; height: auto;"></video>
### LTX-2.3 spatial latent upscale
LTX spatial latent upscale runs a model-backed x2 latent upsampler between the low-resolution video pass and the high-resolution refine pass. `-W` and `-H` set the pre-upscale generation size; the spatial upsampler then doubles the latent width and height before the refine pass.
Put `ltx-2.3-spatial-upscaler-x2-1.1.safetensors` under the directory passed to `--hires-upscalers-dir`, then use the model name without path or extension in `--hires-upscaler`.
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors --hires-upscalers-dir ..\..\ComfyUI\models\latent_upscale_models --hires-upscaler ltx-2.3-spatial-upscaler-x2-1.1 --hires --hires-steps 4 -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -W 640 -H 360 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\ernie_image\turbo_example.png -o hires_i2v.webm
```
By default, the hires refine pass uses the main sampler and scheduler, then trims the second-pass sigma schedule by `--hires-denoising-strength` (`0.7` by default). To reproduce a ComfyUI-style explicit refine schedule, pass custom hires sigmas:
```
--hires-sigmas "0.85,0.725,0.421875,0.0"
```
<video
src="../assets/ltx2/hires_i2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video> style="max-width: 100%; height: auto;"></video>

View File

@ -176,6 +176,8 @@ Generation Options:
model-specific model-specific
--sigmas custom sigma values for the sampler, comma-separated (e.g., --sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0"). "14.61,7.8,3.5,0.0").
--hires-sigmas custom sigma values for the highres fix second pass, comma-separated (e.g.,
"0.85,0.725,0.421875,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)

View File

@ -1134,11 +1134,11 @@ ArgOptions SDGenerationParams::get_options() {
return 1; return 1;
}; };
auto on_sigmas_arg = [&](int argc, const char** argv, int index) { auto parse_sigmas_arg = [&](const char* value, std::vector<float>* target, const char* option_name) {
if (++index >= argc) { if (target == nullptr || value == nullptr) {
return -1; return -1;
} }
std::string sigmas_str = argv[index]; std::string sigmas_str = value;
if (!sigmas_str.empty() && sigmas_str.front() == '[') { if (!sigmas_str.empty() && sigmas_str.front() == '[') {
sigmas_str.erase(0, 1); sigmas_str.erase(0, 1);
} }
@ -1146,6 +1146,7 @@ ArgOptions SDGenerationParams::get_options() {
sigmas_str.pop_back(); sigmas_str.pop_back();
} }
size_t before = target->size();
std::stringstream ss(sigmas_str); std::stringstream ss(sigmas_str);
std::string item; std::string item;
while (std::getline(ss, item, ',')) { while (std::getline(ss, item, ',')) {
@ -1153,24 +1154,38 @@ ArgOptions SDGenerationParams::get_options() {
item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1); item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1);
if (!item.empty()) { if (!item.empty()) {
try { try {
custom_sigmas.push_back(std::stof(item)); target->push_back(std::stof(item));
} catch (const std::invalid_argument&) { } catch (const std::invalid_argument&) {
LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str()); LOG_ERROR("error: invalid float value '%s' in %s", item.c_str(), option_name);
return -1; return -1;
} catch (const std::out_of_range&) { } catch (const std::out_of_range&) {
LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str()); LOG_ERROR("error: float value '%s' out of range in %s", item.c_str(), option_name);
return -1; return -1;
} }
} }
} }
if (custom_sigmas.empty() && !sigmas_str.empty()) { if (target->size() == before && !sigmas_str.empty()) {
LOG_ERROR("error: could not parse any sigma values from '%s'", argv[index]); LOG_ERROR("error: could not parse any sigma values from '%s'", value);
return -1; return -1;
} }
return 1; return 1;
}; };
// Handler for `--sigmas`: the value is the argument that follows the flag.
// Yields -1 when that value is missing, otherwise whatever parse_sigmas_arg
// reports after appending the parsed floats to custom_sigmas.
auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
    const int value_index = index + 1;
    if (value_index < argc) {
        return parse_sigmas_arg(argv[value_index], &custom_sigmas, "--sigmas");
    }
    return -1;
};
// Handler for `--hires-sigmas`: same shape as the `--sigmas` handler, but the
// parsed values are appended to hires_custom_sigmas. Yields -1 when the flag
// is the last argument and therefore has no value.
auto on_hires_sigmas_arg = [&](int argc, const char** argv, int index) {
    const int value_index = index + 1;
    if (value_index < argc) {
        return parse_sigmas_arg(argv[value_index], &hires_custom_sigmas, "--hires-sigmas");
    }
    return -1;
};
auto on_ref_image_arg = [&](int argc, const char** argv, int index) { auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) { if (++index >= argc) {
return -1; return -1;
@ -1293,6 +1308,10 @@ ArgOptions SDGenerationParams::get_options() {
"--sigmas", "--sigmas",
"custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").", "custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").",
on_sigmas_arg}, on_sigmas_arg},
{"",
"--hires-sigmas",
"custom sigma values for the highres fix second pass, comma-separated (e.g., \"0.85,0.725,0.421875,0.0\").",
on_hires_sigmas_arg},
{"", {"",
"--skip-layers", "--skip-layers",
"layers to skip for SLG steps (default: [7,8,9])", "layers to skip for SLG steps (default: [7,8,9])",
@ -1525,11 +1544,31 @@ static bool resolve_model_file_from_dir(const std::string& model_name,
LOG_ERROR("%s directory is empty", label); LOG_ERROR("%s directory is empty", label);
return false; return false;
} }
// True when model_name ends (case-insensitively) with any entry of valid_ext.
// Only recognised model extensions count, so a dot elsewhere in the name does
// not make this return true.
auto ends_with_valid_ext = [&]() {
    // Lowercases a copy of the text via std::tolower, one byte at a time.
    auto lowered = [](std::string text) {
        std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) {
            return static_cast<char>(std::tolower(c));
        });
        return text;
    };
    for (const auto& ext : valid_ext) {
        if (ext.size() > model_name.size()) {
            continue;  // name is too short to carry this extension
        }
        const std::string tail = model_name.substr(model_name.size() - ext.size());
        if (lowered(tail) == lowered(ext)) {
            return true;
        }
    }
    return false;
};
if (model_name.empty() || if (model_name.empty() ||
model_name.find('/') != std::string::npos || model_name.find('/') != std::string::npos ||
model_name.find('\\') != std::string::npos || model_name.find('\\') != std::string::npos ||
fs::path(model_name).has_root_path() || fs::path(model_name).has_root_path() ||
fs::path(model_name).has_extension()) { ends_with_valid_ext()) {
LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str()); LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str());
return false; return false;
} }
@ -1633,6 +1672,9 @@ bool SDGenerationParams::from_json_str(
if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) { if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) {
hires_denoising_strength = hires_json["denoising_strength"]; hires_denoising_strength = hires_json["denoising_strength"];
} }
if (hires_json.contains("custom_sigmas") && hires_json["custom_sigmas"].is_array()) {
hires_custom_sigmas = hires_json["custom_sigmas"].get<std::vector<float>>();
}
if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) { if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) {
hires_upscale_tile_size = hires_json["upscale_tile_size"]; hires_upscale_tile_size = hires_json["upscale_tile_size"];
} }
@ -2080,6 +2122,10 @@ bool SDGenerationParams::validate(SDMode mode) {
LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]"); LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]");
return false; return false;
} }
if (!hires_custom_sigmas.empty() && hires_custom_sigmas.size() < 2) {
LOG_ERROR("error: hires custom sigmas must contain at least two values");
return false;
}
if (hires_upscale_tile_size < 1) { if (hires_upscale_tile_size < 1) {
LOG_ERROR("error: hires upscale tile size must be positive"); LOG_ERROR("error: hires upscale tile size must be positive");
return false; return false;
@ -2174,15 +2220,17 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
params.vae_tiling_params = vae_tiling_params; params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params; params.cache = cache_params;
params.hires.enabled = hires_enabled; params.hires.enabled = hires_enabled;
params.hires.upscaler = resolved_hires_upscaler; params.hires.upscaler = resolved_hires_upscaler;
params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str(); params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
params.hires.scale = hires_scale; params.hires.scale = hires_scale;
params.hires.target_width = hires_width; params.hires.target_width = hires_width;
params.hires.target_height = hires_height; params.hires.target_height = hires_height;
params.hires.steps = hires_steps; params.hires.steps = hires_steps;
params.hires.denoising_strength = hires_denoising_strength; params.hires.denoising_strength = hires_denoising_strength;
params.hires.upscale_tile_size = hires_upscale_tile_size; params.hires.upscale_tile_size = hires_upscale_tile_size;
params.hires.custom_sigmas = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
return params; return params;
} }
@ -2215,27 +2263,38 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str(); high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str(); cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
params.loras = lora_vec.empty() ? nullptr : lora_vec.data(); params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
params.lora_count = static_cast<uint32_t>(lora_vec.size()); params.lora_count = static_cast<uint32_t>(lora_vec.size());
params.prompt = prompt.c_str(); params.prompt = prompt.c_str();
params.negative_prompt = negative_prompt.c_str(); params.negative_prompt = negative_prompt.c_str();
params.clip_skip = clip_skip; params.clip_skip = clip_skip;
params.init_image = init_image.get(); params.init_image = init_image.get();
params.end_image = end_image.get(); params.end_image = end_image.get();
params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data(); params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data();
params.control_frames_size = static_cast<int>(control_frame_views.size()); params.control_frames_size = static_cast<int>(control_frame_views.size());
params.width = get_resolved_width(); params.width = get_resolved_width();
params.height = get_resolved_height(); params.height = get_resolved_height();
params.sample_params = sample_params; params.sample_params = sample_params;
params.high_noise_sample_params = high_noise_sample_params; params.high_noise_sample_params = high_noise_sample_params;
params.moe_boundary = moe_boundary; params.moe_boundary = moe_boundary;
params.strength = strength; params.strength = strength;
params.seed = seed; params.seed = seed;
params.video_frames = video_frames; params.video_frames = video_frames;
params.fps = fps; params.fps = fps;
params.vace_strength = vace_strength; params.vace_strength = vace_strength;
params.vae_tiling_params = vae_tiling_params; params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params; params.cache = cache_params;
params.hires.enabled = hires_enabled;
params.hires.upscaler = resolved_hires_upscaler;
params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
params.hires.scale = hires_scale;
params.hires.target_width = hires_width;
params.hires.target_height = hires_height;
params.hires.steps = hires_steps;
params.hires.denoising_strength = hires_denoising_strength;
params.hires.upscale_tile_size = hires_upscale_tile_size;
params.hires.custom_sigmas = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
return params; return params;
} }
@ -2318,6 +2377,7 @@ std::string SDGenerationParams::to_string() const {
<< ", target_height: " << hires_height << ", target_height: " << hires_height
<< ", steps: " << hires_steps << ", steps: " << hires_steps
<< ", denoising_strength: " << hires_denoising_strength << ", denoising_strength: " << hires_denoising_strength
<< ", custom_sigmas: " << vec_to_string(hires_custom_sigmas)
<< ", upscale_tile_size: " << hires_upscale_tile_size << " },\n" << ", upscale_tile_size: " << hires_upscale_tile_size << " },\n"
<< " vae_tiling_params: { " << " vae_tiling_params: { "
<< vae_tiling_params.enabled << ", " << vae_tiling_params.enabled << ", "
@ -2469,6 +2529,7 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
{"target_height", gen_params.hires_height}, {"target_height", gen_params.hires_height},
{"steps", gen_params.hires_steps}, {"steps", gen_params.hires_steps},
{"denoising_strength", gen_params.hires_denoising_strength}, {"denoising_strength", gen_params.hires_denoising_strength},
{"custom_sigmas", gen_params.hires_custom_sigmas},
{"upscale_tile_size", gen_params.hires_upscale_tile_size}, {"upscale_tile_size", gen_params.hires_upscale_tile_size},
}; };
} }
@ -2588,6 +2649,9 @@ std::string get_image_params(const SDContextParams& ctx_params,
parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", "; parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", ";
parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", "; parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", ";
parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", "; parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", ";
if (!gen_params.hires_custom_sigmas.empty()) {
parameter_string += "Hires custom sigmas: " + vec_to_string(gen_params.hires_custom_sigmas) + ", ";
}
} }
parameter_string += "Version: stable-diffusion.cpp"; parameter_string += "Version: stable-diffusion.cpp";
parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode); parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode);

View File

@ -207,6 +207,7 @@ struct SDGenerationParams {
int hires_steps = 0; int hires_steps = 0;
float hires_denoising_strength = 0.7f; float hires_denoising_strength = 0.7f;
int hires_upscale_tile_size = 128; int hires_upscale_tile_size = 128;
std::vector<float> hires_custom_sigmas;
std::map<std::string, float> lora_map; std::map<std::string, float> lora_map;
std::map<std::string, float> high_noise_lora_map; std::map<std::string, float> high_noise_lora_map;

View File

@ -277,6 +277,8 @@ Default Generation Options:
model-specific model-specific
--sigmas custom sigma values for the sampler, comma-separated (e.g., --sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0"). "14.61,7.8,3.5,0.0").
--hires-sigmas custom sigma values for the highres fix second pass, comma-separated (e.g.,
"0.85,0.725,0.421875,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)

View File

@ -532,6 +532,7 @@ Shared default fields used by both `img_gen` and `vid_gen`:
| `hires.target_height` | `integer` | | `hires.target_height` | `integer` |
| `hires.steps` | `integer` | | `hires.steps` | `integer` |
| `hires.denoising_strength` | `number` | | `hires.denoising_strength` | `number` |
| `hires.custom_sigmas` | `array<number>` |
| `hires.upscale_tile_size` | `integer` | | `hires.upscale_tile_size` | `integer` |
`vid_gen`-specific default fields: `vid_gen`-specific default fields:
@ -685,6 +686,7 @@ Example:
"target_height": 0, "target_height": 0,
"steps": 0, "steps": 0,
"denoising_strength": 0.7, "denoising_strength": 0.7,
"custom_sigmas": [],
"upscale_tile_size": 128 "upscale_tile_size": 128
}, },
@ -799,6 +801,7 @@ Other native fields:
| `hires.target_height` | `integer` | | `hires.target_height` | `integer` |
| `hires.steps` | `integer` | | `hires.steps` | `integer` |
| `hires.denoising_strength` | `number` | | `hires.denoising_strength` | `number` |
| `hires.custom_sigmas` | `array<number>` |
| `hires.upscale_tile_size` | `integer` | | `hires.upscale_tile_size` | `integer` |
| `vae_tiling_params` | `object` | | `vae_tiling_params` | `object` |
| `cache_mode` | `string` | | `cache_mode` | `string` |
@ -806,7 +809,7 @@ Other native fields:
| `scm_mask` | `string` | | `scm_mask` | `string` |
| `scm_policy_dynamic` | `boolean` | | `scm_policy_dynamic` | `boolean` |
For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory. For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory. `hires.custom_sigmas`, when present, overrides the generated second-pass hires sigma schedule; otherwise the hires schedule is trimmed by `hires.denoising_strength`.
HTTP-only output fields: HTTP-only output fields:

View File

@ -100,6 +100,20 @@ static json make_sample_params_json(const sd_sample_params_t& sample_params, con
}; };
} }
// Builds the "hires" JSON object from the generation defaults. Shared by the
// img_gen and vid_gen defaults so both expose the same set of hires fields.
static json make_hires_json(const SDGenerationParams& defaults) {
    json hires                  = json::object();
    hires["enabled"]            = defaults.hires_enabled;
    hires["upscaler"]           = defaults.hires_upscaler;
    hires["scale"]              = defaults.hires_scale;
    hires["target_width"]       = defaults.hires_width;
    hires["target_height"]      = defaults.hires_height;
    hires["steps"]              = defaults.hires_steps;
    hires["denoising_strength"] = defaults.hires_denoising_strength;
    hires["custom_sigmas"]      = defaults.hires_custom_sigmas;
    hires["upscale_tile_size"]  = defaults.hires_upscale_tile_size;
    return hires;
}
static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) { static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
return { return {
{"prompt", defaults.prompt}, {"prompt", defaults.prompt},
@ -114,17 +128,7 @@ static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const
{"increase_ref_index", defaults.increase_ref_index}, {"increase_ref_index", defaults.increase_ref_index},
{"control_strength", defaults.control_strength}, {"control_strength", defaults.control_strength},
{"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)}, {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
{"hires", {"hires", make_hires_json(defaults)},
{
{"enabled", defaults.hires_enabled},
{"upscaler", defaults.hires_upscaler},
{"scale", defaults.hires_scale},
{"target_width", defaults.hires_width},
{"target_height", defaults.hires_height},
{"steps", defaults.hires_steps},
{"denoising_strength", defaults.hires_denoising_strength},
{"upscale_tile_size", defaults.hires_upscale_tile_size},
}},
{"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)}, {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
{"cache_mode", defaults.cache_mode}, {"cache_mode", defaults.cache_mode},
{"cache_option", defaults.cache_option}, {"cache_option", defaults.cache_option},
@ -150,6 +154,7 @@ static json make_vid_gen_defaults_json(const SDGenerationParams& defaults, const
{"vace_strength", defaults.vace_strength}, {"vace_strength", defaults.vace_strength},
{"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)}, {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
{"high_noise_sample_params", make_sample_params_json(defaults.high_noise_sample_params, defaults.high_noise_skip_layers)}, {"high_noise_sample_params", make_sample_params_json(defaults.high_noise_sample_params, defaults.high_noise_skip_layers)},
{"hires", make_hires_json(defaults)},
{"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)}, {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
{"cache_mode", defaults.cache_mode}, {"cache_mode", defaults.cache_mode},
{"cache_option", defaults.cache_option}, {"cache_option", defaults.cache_option},

View File

@ -332,6 +332,8 @@ typedef struct {
int steps; int steps;
float denoising_strength; float denoising_strength;
int upscale_tile_size; int upscale_tile_size;
float* custom_sigmas;
int custom_sigmas_count;
} sd_hires_params_t; } sd_hires_params_t;
typedef struct { typedef struct {
@ -382,6 +384,7 @@ typedef struct {
float vace_strength; float vace_strength;
sd_tiling_params_t vae_tiling_params; sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache; sd_cache_params_t cache;
sd_hires_params_t hires;
} sd_vid_gen_params_t; } sd_vid_gen_params_t;
typedef struct sd_ctx_t sd_ctx_t; typedef struct sd_ctx_t sd_ctx_t;

348
src/ltx_latent_upscaler.hpp Normal file
View File

@ -0,0 +1,348 @@
#ifndef __SD_LTX_LATENT_UPSCALER_HPP__
#define __SD_LTX_LATENT_UPSCALER_HPP__
#include <cinttypes>
#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include "common_dit.hpp"
#include "ggml_extend.hpp"
#include "ggml_graph_cut.h"
#include "model.h"
#include "util.h"
namespace LTXVUpsampler {
// Graph-size constant for the LTX latent upsampler. Its consumer is not visible
// in this part of the file; presumably the ggml graph node budget used by the
// runner. TODO confirm.
constexpr int LTX_UPSAMPLER_GRAPH_SIZE = 10240;
// Architecture parameters of the LTX latent upsampler network.
struct LatentUpsamplerConfig {
    // Channel count of the latent entering initial_conv and leaving final_conv.
    int64_t in_channels{128};
    // Channel count used by every layer between those two convolutions.
    int64_t mid_channels{1024};
    // Number of residual blocks before the upsample step, and again after it.
    int num_blocks_per_stage{4};
    // Convolution dimensionality; LatentUpsampler asserts this is 3.
    int dims{3};
    // LatentUpsampler asserts spatial upsampling is enabled...
    bool spatial_upsample{true};
    // ...and that temporal upsampling and the rational resampler are disabled.
    bool temporal_upsample{false};
    bool rational_resampler{false};
};
static inline bool has_tensor(const String2TensorStorage& tensor_storage_map,
const std::string& name) {
return tensor_storage_map.find(name) != tensor_storage_map.end();
}
// Returns ne[0] of the tensor called `name`, or `fallback` when the weight map
// has no such tensor.
static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map,
                                     const std::string& name,
                                     int64_t fallback) {
    const auto entry = tensor_storage_map.find(name);
    if (entry != tensor_storage_map.end()) {
        return entry->second.ne[0];
    }
    return fallback;
}
static inline int count_module_blocks(const String2TensorStorage& tensor_storage_map,
const std::string& module_name) {
int max_block = -1;
const std::string prefix = module_name + ".";
for (const auto& pair : tensor_storage_map) {
const std::string& name = pair.first;
if (name.find(prefix) != 0) {
continue;
}
size_t begin = prefix.size();
size_t end = name.find('.', begin);
if (end == std::string::npos) {
continue;
}
int index = atoi(name.substr(begin, end - begin).c_str());
max_block = std::max(max_block, index);
}
return max_block + 1;
}
// Derives the upsampler configuration from the tensors found in a weight file.
// Fields that cannot be inferred keep their LatentUpsamplerConfig defaults.
static inline LatentUpsamplerConfig detect_config_from_weights(const String2TensorStorage& tensor_storage_map) {
    LatentUpsamplerConfig detected;

    // Channel widths are read from ne[0] of two 1-D parameters.
    detected.mid_channels = get_tensor_ne0(tensor_storage_map, "initial_norm.weight", detected.mid_channels);
    detected.in_channels  = get_tensor_ne0(tensor_storage_map, "final_conv.bias", detected.in_channels);

    // Keep the default block count when no "res_blocks.<i>." tensors exist.
    const int res_block_count = count_module_blocks(tensor_storage_map, "res_blocks");
    if (res_block_count > 0) {
        detected.num_blocks_per_stage = res_block_count;
    }

    // The upsample mode is inferred from which upsampler weights are present.
    detected.spatial_upsample  = has_tensor(tensor_storage_map, "upsampler.0.weight");
    detected.temporal_upsample = has_tensor(tensor_storage_map, "temporal_upsampler.0.weight");
    return detected;
}
// Group normalization with a learned per-channel scale and shift, applied to
// LTX video latents laid out as [W, H, T, C].
class VideoGroupNorm : public GGMLBlock {
protected:
    int num_groups;
    int64_t num_channels;
    float eps;
    std::string prefix;

    // Allocates the F32 "weight" and "bias" vectors (one value per channel) and
    // remembers the tensor-name prefix for later weight-adapter lookups.
    void init_params(ggml_context* ctx,
                     const String2TensorStorage& tensor_storage_map = {},
                     const std::string prefix                       = "") override {
        SD_UNUSED(tensor_storage_map);
        this->prefix = prefix;
        for (const char* key : {"weight", "bias"}) {
            params[key] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels);
        }
    }

public:
    VideoGroupNorm(int num_groups, int64_t num_channels, float eps = 1e-05f)
        : num_groups(num_groups),
          num_channels(num_channels),
          eps(eps) {}

    // Normalizes x and returns a tensor with the same [W, H, T, C] shape.
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        GGML_ASSERT(x->ne[3] == num_channels);
        auto* g = ctx->ggml_ctx;

        const int64_t width  = x->ne[0];
        const int64_t height = x->ne[1];
        const int64_t frames = x->ne[2];

        // ggml_group_norm treats ne[2] as the channel axis, so H and T are
        // folded into ne[1] to move the channels from ne[3] to ne[2].
        ggml_tensor* folded = ggml_reshape_4d(g, ggml_ext_cont(g, x), width, height * frames, num_channels, 1);
        ggml_tensor* normed = ggml_group_norm(g, folded, num_groups, eps);

        ggml_tensor* scale = params["weight"];
        ggml_tensor* shift = params["bias"];
        if (ctx->weight_adapter) {
            scale = ctx->weight_adapter->patch_weight(g, ctx->backend, scale, prefix + "weight");
            shift = ctx->weight_adapter->patch_weight(g, ctx->backend, shift, prefix + "bias");
        }
        // Shape the affine parameters as [1, 1, C, 1] so they line up with the
        // channel axis of the folded tensor.
        scale = ggml_reshape_4d(g, scale, 1, 1, num_channels, 1);
        shift = ggml_reshape_4d(g, shift, 1, 1, num_channels, 1);

        normed = ggml_mul_inplace(g, normed, scale);
        normed = ggml_add_inplace(g, normed, shift);

        // Undo the H/T fold to restore the caller's layout.
        return ggml_reshape_4d(g, normed, width, height, frames, num_channels);
    }
};
// Residual block: two (3x3x3 conv -> group norm) stages with a SiLU between
// them, then the block input is added back and a final SiLU is applied.
class ResBlock : public GGMLBlock {
public:
    ResBlock(int64_t channels, int dims = 3) {
        GGML_ASSERT(dims == 3);
        blocks["conv1"].reset(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
        blocks["norm1"].reset(new VideoGroupNorm(32, channels));
        blocks["conv2"].reset(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
        blocks["norm2"].reset(new VideoGroupNorm(32, channels));
    }

    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        auto first_conv  = std::dynamic_pointer_cast<Conv3d>(blocks["conv1"]);
        auto first_norm  = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm1"]);
        auto second_conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv2"]);
        auto second_norm = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm2"]);

        // x itself is kept untouched so it can serve as the skip connection.
        ggml_tensor* h = first_conv->forward(ctx, x);
        h              = first_norm->forward(ctx, h);
        h              = ggml_silu_inplace(ctx->ggml_ctx, h);

        h = second_conv->forward(ctx, h);
        h = second_norm->forward(ctx, h);

        ggml_tensor* summed = ggml_add(ctx->ggml_ctx, h, x);
        return ggml_silu(ctx->ggml_ctx, summed);
    }
};
// 2D pixel shuffle: trades a 4x channel expansion for 2x width and 2x height.
// Only an upscale factor of 2 is supported (asserted in forward).
class PixelShuffleND : public UnaryBlock {
protected:
    int upscale_factor;

public:
    explicit PixelShuffleND(int upscale_factor)
        : upscale_factor(upscale_factor) {}

    // In torch order: [b*f, c*4, h, w] -> [b*f, c, h*2, w*2].
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
        GGML_ASSERT(upscale_factor == 2);
        auto* g = ctx->ggml_ctx;

        // Capture the input spatial size before the axes are rearranged.
        int64_t in_width  = x->ne[0];
        int64_t in_height = x->ne[1];

        // Move channels last (torch order [b*f, h, w, c*4]) ...
        ggml_tensor* channels_last = ggml_ext_cont(g, ggml_ext_torch_permute(g, x, 2, 0, 1, 3));
        // ... and merge h and w into one token axis: [b*f, h*w, c*4].
        ggml_tensor* tokens = ggml_reshape_3d(g,
                                              channels_last,
                                              channels_last->ne[0],
                                              channels_last->ne[1] * channels_last->ne[2],
                                              channels_last->ne[3]);
        return DiT::unpatchify(g, tokens, in_height, in_width, upscale_factor, upscale_factor, true);
    }
};
// LTX spatial latent upsampler network:
//   initial conv + norm + SiLU -> residual stage -> per-frame 2D conv and
//   pixel shuffle (x2 spatial) -> residual stage -> final conv.
// Only the 3D, spatial-only, non-rational configuration is supported; any
// other configuration trips an assertion in the constructor.
class LatentUpsampler : public GGMLBlock {
public:
    LatentUpsamplerConfig config;

    explicit LatentUpsampler(LatentUpsamplerConfig config)
        : config(std::move(config)) {
        GGML_ASSERT(this->config.dims == 3);
        GGML_ASSERT(this->config.spatial_upsample);
        GGML_ASSERT(!this->config.temporal_upsample);
        GGML_ASSERT(!this->config.rational_resampler);

        const int64_t latent_channels = this->config.in_channels;
        const int64_t hidden_channels = this->config.mid_channels;

        // Registers "<stage>.<i>" residual blocks, one per configured block.
        auto add_res_stage = [this](const std::string& stage) {
            for (int i = 0; i < this->config.num_blocks_per_stage; ++i) {
                blocks[stage + "." + std::to_string(i)].reset(new ResBlock(this->config.mid_channels, this->config.dims));
            }
        };

        blocks["initial_conv"].reset(new Conv3d(latent_channels, hidden_channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
        blocks["initial_norm"].reset(new VideoGroupNorm(32, hidden_channels));
        add_res_stage("res_blocks");

        // The 2D conv expands channels 4x; the pixel shuffle converts that
        // expansion into 2x width and 2x height.
        blocks["upsampler.0"].reset(new Conv2d(hidden_channels, 4 * hidden_channels, {3, 3}, {1, 1}, {1, 1}));
        blocks["upsampler.1"].reset(new PixelShuffleND(2));
        add_res_stage("post_upsample_res_blocks");

        blocks["final_conv"].reset(new Conv3d(hidden_channels, latent_channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
    }

    // x: [b*c, f, h, w] (torch order); returns [b*c, f, h*2, w*2].
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        auto* g = ctx->ggml_ctx;

        // Runs one residual stage, marking a graph cut after every block.
        auto run_res_stage = [&](const std::string& block_prefix, const std::string& cut_prefix) {
            for (int i = 0; i < config.num_blocks_per_stage; ++i) {
                const std::string suffix = std::to_string(i);
                auto res_block           = std::dynamic_pointer_cast<ResBlock>(blocks[block_prefix + suffix]);
                x                        = res_block->forward(ctx, x);
                sd::ggml_graph_cut::mark_graph_cut(x, cut_prefix + suffix, "x");
            }
        };

        auto stem_conv = std::dynamic_pointer_cast<Conv3d>(blocks["initial_conv"]);
        auto stem_norm = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["initial_norm"]);
        x              = stem_conv->forward(ctx, x);
        x              = stem_norm->forward(ctx, x);
        x              = ggml_silu(g, x);
        sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.initial", "x");

        run_res_stage("res_blocks.", "ltx_latent_upsampler.res_blocks.");

        // rearrange(x, "b c f h w -> (b f) c h w") so the 2D upsampler sees
        // each frame as a separate image.
        auto expand_conv = std::dynamic_pointer_cast<Conv2d>(blocks["upsampler.0"]);
        auto shuffle     = std::dynamic_pointer_cast<PixelShuffleND>(blocks["upsampler.1"]);
        x                = ggml_ext_cont(g, ggml_ext_torch_permute(g, x, 0, 1, 3, 2));  // [b*f, c, h, w]
        x                = expand_conv->forward(ctx, x);                                // [b*f, c*4, h, w]
        x                = shuffle->forward(ctx, x);                                    // [b*f, c, h*2, w*2]
        // Swap the frame and channel axes back to the 3D-conv layout.
        x = ggml_ext_cont(g, ggml_ext_torch_permute(g, x, 0, 1, 3, 2));  // [b*c, f, h*2, w*2]
        sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.spatial_up", "x");

        run_res_stage("post_upsample_res_blocks.", "ltx_latent_upsampler.post_blocks.");

        auto head_conv = std::dynamic_pointer_cast<Conv3d>(blocks["final_conv"]);
        x              = head_conv->forward(ctx, x);
        sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.final", "x");
        return x;
    }
};
struct LatentUpsamplerRunner : public GGMLRunner {
std::unique_ptr<LatentUpsampler> model;
LatentUpsamplerRunner(ggml_backend_t backend,
ggml_backend_t params_backend)
: GGMLRunner(backend, params_backend) {}
std::string get_desc() override {
return "ltx_latent_upsampler";
}
bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading LTX latent upsampler from '%s'", file_path.c_str());
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init LTX latent upsampler model loader from file failed: '%s'", file_path.c_str());
return false;
}
const auto& tensor_storage_map = model_loader.get_tensor_storage_map();
if (!has_tensor(tensor_storage_map, "post_upsample_res_blocks.0.conv2.bias") ||
!has_tensor(tensor_storage_map, "upsampler.0.weight")) {
LOG_ERROR("unsupported LTX latent upsampler weights: expected spatial upsampler tensors");
return false;
}
LatentUpsamplerConfig config = detect_config_from_weights(tensor_storage_map);
if (config.dims != 3 || !config.spatial_upsample || config.temporal_upsample ||
config.rational_resampler) {
LOG_ERROR("unsupported LTX latent upsampler config: dims=%d spatial=%d temporal=%d rational=%d",
config.dims,
config.spatial_upsample,
config.temporal_upsample,
config.rational_resampler);
return false;
}
model = std::make_unique<LatentUpsampler>(config);
model->init(params_ctx, tensor_storage_map, "");
if (!alloc_params_buffer()) {
LOG_ERROR("LTX latent upsampler params buffer allocation failed");
return false;
}
std::map<std::string, ggml_tensor*> tensors;
model->get_param_tensors(tensors);
if (!model_loader.load_tensors(tensors, {}, n_threads)) {
LOG_ERROR("load LTX latent upsampler tensors failed");
return false;
}
LOG_INFO("LTX latent upsampler loaded: in_channels=%" PRId64 ", mid_channels=%" PRId64 ", blocks=%d",
config.in_channels,
config.mid_channels,
config.num_blocks_per_stage);
return true;
}
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
if (!model) {
return nullptr;
}
ggml_cgraph* gf = new_graph_custom(LTX_UPSAMPLER_GRAPH_SIZE);
ggml_tensor* x = make_input(x_tensor);
auto runner_ctx = get_context();
ggml_tensor* out = model->forward(&runner_ctx, x);
ggml_build_forward_expand(gf, out);
return gf;
}
sd::Tensor<float> compute(const int n_threads,
const sd::Tensor<float>& x) {
if (!model) {
LOG_ERROR("LTX latent upsampler is not loaded");
return {};
}
if (x.dim() != 4 && x.dim() != 5) {
LOG_ERROR("LTX latent upsampler expects 4D or 5D video latent, got dim=%lld",
(long long)x.dim());
return {};
}
if (x.dim() == 5 && x.shape()[4] != 1) {
LOG_ERROR("LTX latent upsampler currently supports batch size 1, got batch=%lld",
(long long)x.shape()[4]);
return {};
}
if (x.shape()[3] != model->config.in_channels) {
LOG_ERROR("LTX latent upsampler expected %" PRId64 " channels, got %lld",
model->config.in_channels,
(long long)x.shape()[3]);
return {};
}
size_t expected_dim = static_cast<size_t>(x.dim());
auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); };
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), expected_dim);
}
};
} // namespace LTXVUpsampler
#endif // __SD_LTX_LATENT_UPSCALER_HPP__

View File

@ -1123,6 +1123,18 @@ namespace LTXVAE {
mean = ggml_cont(ctx->ggml_ctx, mean); mean = ggml_cont(ctx->ggml_ctx, mean);
return processor->normalize(ctx, mean); return processor->normalize(ctx, mean);
} }
// Forwards `x` to normalize() on the "per_channel_statistics" block.
ggml_tensor* normalize_latents(GGMLRunnerContext* ctx, ggml_tensor* x) {
    auto stats_block = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
    return stats_block->normalize(ctx, x);
}
// Forwards `x` to un_normalize() on the "per_channel_statistics" block.
ggml_tensor* un_normalize_latents(GGMLRunnerContext* ctx, ggml_tensor* x) {
    auto stats_block = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
    return stats_block->un_normalize(ctx, x);
}
}; };
} // namespace LTXVAE } // namespace LTXVAE
@ -1192,6 +1204,17 @@ struct LTXVideoVAE : public VAE {
return gf; return gf;
} }
// Builds a small graph that either normalizes (`normalize == true`) or
// un-normalizes (`normalize == false`) the latent `z_tensor` through the VAE.
ggml_cgraph* build_latent_statistics_graph(const sd::Tensor<float>& z_tensor, bool normalize) {
    ggml_cgraph* graph     = new_graph_custom(1024);
    ggml_tensor* latent_in = make_input(z_tensor);
    auto ctx               = get_context();

    ggml_tensor* result = nullptr;
    if (normalize) {
        result = vae.normalize_latents(&ctx, latent_in);
    } else {
        result = vae.un_normalize_latents(&ctx, latent_in);
    }

    ggml_build_forward_expand(graph, result);
    return graph;
}
sd::Tensor<float> _compute(const int n_threads, sd::Tensor<float> _compute(const int n_threads,
const sd::Tensor<float>& z, const sd::Tensor<float>& z,
bool decode_graph) override { bool decode_graph) override {
@ -1226,6 +1249,26 @@ struct LTXVideoVAE : public VAE {
return result; return result;
} }
// Runs the latent-statistics graph on `z` and passes the result through
// restore_trailing_singleton_dims together with the rank of `z`.
sd::Tensor<float> apply_latent_statistics(const int n_threads,
                                          const sd::Tensor<float>& z,
                                          bool normalize) {
    const size_t input_rank = static_cast<size_t>(z.dim());
    auto graph_builder      = [this, &z, normalize]() -> ggml_cgraph* {
        return build_latent_statistics_graph(z, normalize);
    };
    return restore_trailing_singleton_dims(GGMLRunner::compute<float>(graph_builder, n_threads, false),
                                           input_rank);
}
// Convenience wrapper: apply_latent_statistics in "normalize" mode.
sd::Tensor<float> normalize_latents(const int n_threads, const sd::Tensor<float>& z) {
    constexpr bool kNormalize = true;
    return apply_latent_statistics(n_threads, z, kNormalize);
}
// Convenience wrapper: apply_latent_statistics in "un-normalize" mode.
sd::Tensor<float> un_normalize_latents(const int n_threads, const sd::Tensor<float>& z) {
    constexpr bool kNormalize = false;
    return apply_latent_statistics(n_threads, z, kNormalize);
}
int get_encoder_output_channels(int input_channels) override { int get_encoder_output_channels(int input_channels) override {
SD_UNUSED(input_channels); SD_UNUSED(input_channels);
return 256; return 256;

View File

@ -17,6 +17,7 @@
#include "guidance.h" #include "guidance.h"
#include "lora.hpp" #include "lora.hpp"
#include "ltx_audio_vae.h" #include "ltx_audio_vae.h"
#include "ltx_latent_upscaler.hpp"
#include "ltx_vae.hpp" #include "ltx_vae.hpp"
#include "pmid.hpp" #include "pmid.hpp"
#include "sample-cache.h" #include "sample-cache.h"
@ -2198,6 +2199,24 @@ public:
return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
} }
// Normalizes `x` with the LTX video VAE; logs an error and returns an empty
// tensor when the first-stage model is not an LTXVideoVAE.
sd::Tensor<float> normalize_ltx_video_latents(const sd::Tensor<float>& x) {
    if (auto video_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model)) {
        return video_vae->normalize_latents(n_threads, x);
    }
    LOG_ERROR("LTX latent normalization requires LTX video VAE");
    return {};
}
// Un-normalizes `x` with the LTX video VAE; logs an error and returns an empty
// tensor when the first-stage model is not an LTXVideoVAE.
sd::Tensor<float> un_normalize_ltx_video_latents(const sd::Tensor<float>& x) {
    if (auto video_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model)) {
        return video_vae->un_normalize_latents(n_threads, x);
    }
    LOG_ERROR("LTX latent un-normalization requires LTX video VAE");
    return {};
}
sd::Tensor<float> decode_ltx_audio_latent(const sd::Tensor<float>& audio_latent) { sd::Tensor<float> decode_ltx_audio_latent(const sd::Tensor<float>& audio_latent) {
if (audio_vae_model == nullptr || audio_latent.empty()) { if (audio_vae_model == nullptr || audio_latent.empty()) {
return {}; return {};
@ -2464,16 +2483,18 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
} }
void sd_hires_params_init(sd_hires_params_t* hires_params) { void sd_hires_params_init(sd_hires_params_t* hires_params) {
*hires_params = {}; *hires_params = {};
hires_params->enabled = false; hires_params->enabled = false;
hires_params->upscaler = SD_HIRES_UPSCALER_LATENT; hires_params->upscaler = SD_HIRES_UPSCALER_LATENT;
hires_params->model_path = nullptr; hires_params->model_path = nullptr;
hires_params->scale = 2.0f; hires_params->scale = 2.0f;
hires_params->target_width = 0; hires_params->target_width = 0;
hires_params->target_height = 0; hires_params->target_height = 0;
hires_params->steps = 0; hires_params->steps = 0;
hires_params->denoising_strength = 0.7f; hires_params->denoising_strength = 0.7f;
hires_params->upscale_tile_size = 128; hires_params->upscale_tile_size = 128;
hires_params->custom_sigmas = nullptr;
hires_params->custom_sigmas_count = 0;
} }
void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
@ -2746,6 +2767,16 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
sd_vid_gen_params->moe_boundary = 0.875f; sd_vid_gen_params->moe_boundary = 0.875f;
sd_vid_gen_params->vace_strength = 1.f; sd_vid_gen_params->vace_strength = 1.f;
sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f}; sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};
sd_vid_gen_params->hires.enabled = false;
sd_vid_gen_params->hires.upscaler = SD_HIRES_UPSCALER_LATENT;
sd_vid_gen_params->hires.scale = 2.f;
sd_vid_gen_params->hires.target_width = 0;
sd_vid_gen_params->hires.target_height = 0;
sd_vid_gen_params->hires.steps = 0;
sd_vid_gen_params->hires.denoising_strength = 0.7f;
sd_vid_gen_params->hires.upscale_tile_size = 128;
sd_vid_gen_params->hires.custom_sigmas = nullptr;
sd_vid_gen_params->hires.custom_sigmas_count = 0;
sd_cache_params_init(&sd_vid_gen_params->cache); sd_cache_params_init(&sd_vid_gen_params->cache);
} }
@ -2995,6 +3026,7 @@ struct GenerationRequest {
vace_strength = sd_vid_gen_params->vace_strength; vace_strength = sd_vid_gen_params->vace_strength;
guidance = sd_vid_gen_params->sample_params.guidance; guidance = sd_vid_gen_params->sample_params.guidance;
high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance; high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance;
hires = sd_vid_gen_params->hires;
resolve(sd_ctx); resolve(sd_ctx);
if (frames != requested_frames) { if (frames != requested_frames) {
LOG_WARN("align video frames from %d to %d for %s", LOG_WARN("align video frames from %d to %d for %s",
@ -3053,6 +3085,20 @@ struct GenerationRequest {
hires.enabled = false; hires.enabled = false;
return; return;
} }
if (hires.custom_sigmas_count < 0) {
LOG_WARN("hires custom sigmas count is negative, ignoring custom sigmas");
hires.custom_sigmas = nullptr;
hires.custom_sigmas_count = 0;
}
if (hires.custom_sigmas_count > 0 && hires.custom_sigmas == nullptr) {
LOG_WARN("hires custom sigmas count is positive but custom sigmas are null, ignoring custom sigmas");
hires.custom_sigmas_count = 0;
}
if (hires.custom_sigmas_count == 1) {
LOG_WARN("hires custom sigmas requires at least two values, ignoring custom sigmas");
hires.custom_sigmas = nullptr;
hires.custom_sigmas_count = 0;
}
hires.denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f); hires.denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f);
hires.steps = std::max(0, hires.steps); hires.steps = std::max(0, hires.steps);
@ -3417,6 +3463,85 @@ static sd::Tensor<float> pack_ltxav_audio_and_video_denoise_mask(const sd::Tenso
return sd::ops::concat(video_mask_full, audio_mask, 3); return sd::ops::concat(video_mask_full, audio_mask, 3);
} }
// Creates a mask filled with `value` whose first three extents copy those of
// `video_latent` and whose last two extents are 1. Empty input yields an empty tensor.
static sd::Tensor<float> make_ltxav_video_denoise_mask(const sd::Tensor<float>& video_latent, float value = 1.f) {
    if (video_latent.empty()) {
        return {};
    }
    const auto& latent_shape = video_latent.shape();
    return sd::full<float>({latent_shape[0], latent_shape[1], latent_shape[2], 1, 1}, value);
}
// Encodes a conditioning image with the first-stage model.
// The image is reshaped so that a singleton extent is inserted at index 2 before encoding.
// Returns an empty tensor for a missing context or empty image; an encoding failure is
// logged with `name` and the (empty) result is returned as-is.
static sd::Tensor<float> encode_ltxav_condition_image(sd_ctx_t* sd_ctx,
                                                      const sd::Tensor<float>& image,
                                                      const char* name) {
    const bool has_context = sd_ctx != nullptr && sd_ctx->sd != nullptr;
    if (!has_context || image.empty()) {
        return {};
    }

    const auto& dims  = image.shape();
    auto single_frame = image.reshape({dims[0], dims[1], 1, dims[2], dims[3]});
    auto encoded      = sd_ctx->sd->encode_first_stage(single_frame);
    if (encoded.empty()) {
        LOG_ERROR("failed to encode LTXAV %s image", name);
    }
    return encoded;
}
// Writes `condition_latent` into `video_latent` along axis 2 starting at `latent_idx`,
// and fills the same range of `video_mask` with `conditioned_mask`.
// Returns false (logging where the original did) when a pointer is null, a tensor is
// empty, extents 0/1/3 differ, or the target range does not fit.
static bool apply_ltxav_condition_by_latent_index(sd::Tensor<float>* video_latent,
                                                  sd::Tensor<float>* video_mask,
                                                  const sd::Tensor<float>& condition_latent,
                                                  int64_t latent_idx,
                                                  const char* name,
                                                  float conditioned_mask) {
    if (video_latent == nullptr || video_mask == nullptr) {
        return false;
    }
    if (video_latent->empty() || video_mask->empty()) {
        return false;
    }

    // Extents on axes 0, 1 and 3 must agree between the condition and the target latent.
    const auto same_extent = [&](int axis) {
        return condition_latent.shape()[axis] == video_latent->shape()[axis];
    };
    const bool compatible = !condition_latent.empty() &&
                            same_extent(0) &&
                            same_extent(1) &&
                            same_extent(3);
    if (!compatible) {
        LOG_ERROR("invalid LTXAV %s condition latent shape", name);
        return false;
    }

    const int64_t total_frames = video_latent->shape()[2];
    const int64_t cond_frames  = condition_latent.shape()[2];
    const bool range_ok        = latent_idx >= 0 &&
                                 cond_frames > 0 &&
                                 latent_idx + cond_frames <= total_frames;
    if (!range_ok) {
        LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
                  name,
                  latent_idx,
                  cond_frames,
                  total_frames);
        return false;
    }

    const int64_t range_end = latent_idx + cond_frames;
    sd::ops::slice_assign(video_latent, 2, latent_idx, range_end, condition_latent);
    sd::ops::fill_slice(video_mask, 2, latent_idx, range_end, conditioned_mask);
    return true;
}
// Encodes `image` and applies it as a condition at `latent_idx`.
// The mask value written is 1 - clamp(strength, 0, 1). Returns false when encoding
// yields an empty latent or when applying the condition fails.
static bool apply_ltxav_condition_image_by_latent_index(sd_ctx_t* sd_ctx,
                                                        const sd::Tensor<float>& image,
                                                        sd::Tensor<float>* video_latent,
                                                        sd::Tensor<float>* video_mask,
                                                        int64_t latent_idx,
                                                        const char* name,
                                                        float strength) {
    auto encoded = encode_ltxav_condition_image(sd_ctx, image, name);
    if (encoded.empty()) {
        return false;
    }
    const float mask_value = 1.0f - std::clamp(strength, 0.f, 1.f);
    return apply_ltxav_condition_by_latent_index(video_latent,
                                                 video_mask,
                                                 encoded,
                                                 latent_idx,
                                                 name,
                                                 mask_value);
}
static sd::Tensor<float> unpack_ltxav_audio_latent(const sd::Tensor<float>& packed_latent, static sd::Tensor<float> unpack_ltxav_audio_latent(const sd::Tensor<float>& packed_latent,
int audio_length, int audio_length,
int video_channels) { int video_channels) {
@ -3978,6 +4103,53 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
return {}; return {};
} }
// Produces the sigma schedule for the hires/refine pass.
// - If custom sigmas are supplied (count > 0 and non-null pointer) they are returned
//   verbatim and the reported step count is their size minus one.
// - Otherwise the scheduler is asked for int(steps / denoising_strength) steps (at least 1)
//   and the tail selected by the denoising strength is returned.
// `scheduler_steps_out` is optional; it is zeroed on entry and set to the scheduler step count.
static std::vector<float> make_hires_sigma_schedule(sd_ctx_t* sd_ctx,
                                                    const sd_hires_params_t& hires,
                                                    const sd_sample_params_t& sample_params,
                                                    sample_method_t sample_method,
                                                    int default_steps,
                                                    int sample_seq_len,
                                                    int* scheduler_steps_out) {
    // Reports a step count only when the caller provided somewhere to store it.
    const auto report_steps = [scheduler_steps_out](int steps) {
        if (scheduler_steps_out != nullptr) {
            *scheduler_steps_out = steps;
        }
    };
    report_steps(0);

    const bool use_custom = hires.custom_sigmas_count > 0 && hires.custom_sigmas != nullptr;
    if (use_custom) {
        std::vector<float> schedule(hires.custom_sigmas,
                                    hires.custom_sigmas + hires.custom_sigmas_count);
        report_steps(static_cast<int>(schedule.size()) - 1);
        return schedule;
    }

    int requested_steps = hires.steps > 0 ? hires.steps : default_steps;
    requested_steps     = std::max(1, requested_steps);

    // sd-webui behavior: the total step count is scaled up so that trimming by
    // denoising_strength leaves the requested number of effective steps, unlike img2img,
    // which trims from a fixed step count.
    int total_steps = static_cast<int>(requested_steps / hires.denoising_strength);
    total_steps     = std::max(1, total_steps);

    scheduler_t resolved           = resolve_scheduler(sd_ctx,
                                                       sample_params.scheduler,
                                                       sample_method);
    std::vector<float> all_sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps,
                                                                     sample_seq_len,
                                                                     resolved,
                                                                     sd_ctx->sd->version,
                                                                     sample_params.extra_sample_args);

    // Number of trailing steps to keep, capped at total_steps - 1.
    const size_t max_kept = static_cast<size_t>(total_steps) - 1;
    size_t kept_steps     = static_cast<size_t>(total_steps * hires.denoising_strength);
    if (kept_steps > max_kept) {
        kept_steps = max_kept;
    }

    report_steps(total_steps);

    const int first_index = total_steps - static_cast<int>(kept_steps) - 1;
    return std::vector<float>(all_sigmas.begin() + first_index, all_sigmas.end());
}
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
return nullptr; return nullptr;
@ -4100,29 +4272,20 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
} }
} }
int hires_steps = request.hires.steps > 0 ? request.hires.steps : plan.sample_steps; int hires_scheduler_steps = 0;
std::vector<float> hires_sigma_sched =
// sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps, make_hires_sigma_schedule(sd_ctx,
// unlike img2img which trims from a fixed step count request.hires,
hires_steps = static_cast<int>(hires_steps / request.hires.denoising_strength); sd_img_gen_params->sample_params,
plan.sample_method,
std::vector<float> hires_sigmas = sd_ctx->sd->denoiser->get_sigmas( plan.sample_steps,
hires_steps, sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width), &hires_scheduler_steps);
sd_img_gen_params->sample_params.scheduler, LOG_INFO("hires fix: scheduler_steps=%d, denoising_strength=%.2f, sigma_sched_size=%zu%s",
sd_ctx->sd->version, hires_scheduler_steps,
sd_img_gen_params->sample_params.extra_sample_args);
size_t t_enc = static_cast<size_t>(hires_steps * request.hires.denoising_strength);
if (t_enc >= static_cast<size_t>(hires_steps)) {
t_enc = static_cast<size_t>(hires_steps) - 1;
}
std::vector<float> hires_sigma_sched(hires_sigmas.begin() + hires_steps - static_cast<int>(t_enc) - 1,
hires_sigmas.end());
LOG_INFO("hires fix: %d steps, denoising_strength=%.2f, sigma_sched_size=%zu",
hires_steps,
request.hires.denoising_strength, request.hires.denoising_strength,
hires_sigma_sched.size()); hires_sigma_sched.size(),
request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");
std::vector<sd::Tensor<float>> hires_final_latents; std::vector<sd::Tensor<float>> hires_final_latents;
int64_t hires_denoise_start = ggml_time_ms(); int64_t hires_denoise_start = ggml_time_ms();
@ -4270,44 +4433,7 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
float conditioning_strength = std::clamp(request->strength, 0.f, 1.f); float conditioning_strength = std::clamp(request->strength, 0.f, 1.f);
float conditioned_mask = 1.0f - conditioning_strength; float conditioned_mask = 1.0f - conditioning_strength;
latents.denoise_mask = sd::full<float>({latents.init_latent.shape()[0], latents.denoise_mask = make_ltxav_video_denoise_mask(latents.init_latent, 1.f);
latents.init_latent.shape()[1],
latents.init_latent.shape()[2],
1,
1},
1.f);
auto encode_ltxav_condition_image = [&](const sd::Tensor<float>& image, const char* name) -> sd::Tensor<float> {
auto condition_image = image.reshape({image.shape()[0],
image.shape()[1],
1,
image.shape()[2],
image.shape()[3]});
auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image);
if (condition_latent.empty()) {
LOG_ERROR("failed to encode LTXAV %s image", name);
}
return condition_latent;
};
auto apply_video_condition_by_latent_index = [&](const sd::Tensor<float>& condition_latent,
int64_t latent_idx,
const char* name) -> bool {
int64_t latent_frames = latents.init_latent.shape()[2];
int64_t condition_frames = condition_latent.shape()[2];
if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) {
LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
name,
latent_idx,
condition_frames,
latent_frames);
return false;
}
sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent);
sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask);
return true;
};
auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor<float>& keyframes, auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor<float>& keyframes,
int frame_idx, int frame_idx,
@ -4345,20 +4471,30 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
}; };
if (!start_image.empty()) { if (!start_image.empty()) {
auto start_image_latent = encode_ltxav_condition_image(start_image, "init"); if (!apply_ltxav_condition_image_by_latent_index(sd_ctx,
if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) { start_image,
&latents.init_latent,
&latents.denoise_mask,
0,
"init",
conditioning_strength)) {
return std::nullopt; return std::nullopt;
} }
} }
if (!end_image.empty()) { if (!end_image.empty()) {
auto end_image_latent = encode_ltxav_condition_image(end_image, "end"); auto end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end");
if (end_image_latent.empty()) { if (end_image_latent.empty()) {
return std::nullopt; return std::nullopt;
} }
int frame_idx = request->frames - 1; int frame_idx = request->frames - 1;
bool ok = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end") bool ok = frame_idx == 0 ? apply_ltxav_condition_by_latent_index(&latents.init_latent,
&latents.denoise_mask,
end_image_latent,
0,
"end",
conditioned_mask)
: apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end"); : apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end");
if (!ok) { if (!ok) {
return std::nullopt; return std::nullopt;
@ -4639,6 +4775,175 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
return result_images; return result_images;
} }
// Runs the model-backed LTX spatial latent upsampler on `packed_latent`.
//
// When shape()[3] exceeds the model's latent channel count, the latent is treated as
// audio+video packed: the video part is sliced out, the audio part is unpacked with
// `audio_length`, and the two are re-packed after upscaling.
// The video latent is un-normalized before the upsampler and normalized again afterwards.
//
// Returns an empty tensor on any failure (an error is logged for each failure except the
// initial argument/backend checks, which return silently as in the original).
static sd::Tensor<float> upscale_ltx_spatial_video_latent(sd_ctx_t* sd_ctx,
                                                          const char* model_path,
                                                          const sd::Tensor<float>& packed_latent,
                                                          int audio_length) {
    if (sd_ctx == nullptr || sd_ctx->sd == nullptr || packed_latent.empty()) {
        return {};
    }
    if (strlen(SAFE_STR(model_path)) == 0) {
        LOG_ERROR("LTX latent spatial upscale requires a model path");
        return {};
    }
    if (!sd_ctx->sd->ensure_backend_pair(SDBackendModule::UPSCALER)) {
        return {};
    }

    int latent_channels            = sd_ctx->sd->get_latent_channel();
    sd::Tensor<float> video_latent = packed_latent;
    sd::Tensor<float> audio_latent;
    if (packed_latent.shape()[3] > latent_channels) {
        video_latent = sd::ops::slice(packed_latent, 3, 0, latent_channels);
        audio_latent = unpack_ltxav_audio_latent(packed_latent, audio_length, latent_channels);
        // Fail loudly instead of silently dropping the audio part: the refine-conditioning
        // path treats the same unpack failure as an error, and returning a video-only latent
        // here would no longer match the caller's audio_length.
        if (audio_latent.empty()) {
            LOG_ERROR("failed to unpack LTXAV audio latent before spatial upscale");
            return {};
        }
    }

    LOG_INFO("LTX latent spatial upscale: latent %dx%dx%dx%d -> x2",
             (int)video_latent.shape()[0],
             (int)video_latent.shape()[1],
             (int)video_latent.shape()[2],
             (int)video_latent.shape()[3]);

    sd::Tensor<float> unnormalized = sd_ctx->sd->un_normalize_ltx_video_latents(video_latent);
    if (unnormalized.empty()) {
        LOG_ERROR("LTX latent un-normalization failed before spatial upscale");
        return {};
    }

    std::unique_ptr<LTXVUpsampler::LatentUpsamplerRunner> upsampler =
        std::make_unique<LTXVUpsampler::LatentUpsamplerRunner>(sd_ctx->sd->backend_for(SDBackendModule::UPSCALER),
                                                               sd_ctx->sd->params_backend_for(SDBackendModule::UPSCALER));
    const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
    upsampler->set_max_graph_vram_bytes(max_graph_vram_bytes);
    if (!upsampler->load_from_file(model_path, sd_ctx->sd->n_threads)) {
        LOG_ERROR("load LTX latent upsampler failed");
        return {};
    }

    sd::Tensor<float> upscaled = upsampler->compute(sd_ctx->sd->n_threads, unnormalized);
    // Destroy the runner as soon as its output is available, before the normalization pass.
    upsampler.reset();
    if (upscaled.empty()) {
        LOG_ERROR("LTX latent spatial upscale failed");
        return {};
    }

    upscaled = sd_ctx->sd->normalize_ltx_video_latents(upscaled);
    if (upscaled.empty()) {
        LOG_ERROR("LTX latent normalization failed after spatial upscale");
        return {};
    }

    if (!audio_latent.empty()) {
        upscaled = pack_ltxav_audio_and_video_latents(upscaled, audio_latent);
    }
    return upscaled;
}
// Re-applies init/end image conditioning to the (upscaled) latent before the refine pass.
//
// Outputs, written only when conditioning is actually applied:
//   *latent          - latent with the encoded condition image(s) written in
//   *denoise_mask    - mask built from the video latent, with conditioned ranges filled
//   *video_positions - rebuilt only in the end-image keyframe branch (frame_idx != 0)
//
// Returns true when conditioning was applied or when there was nothing to do; returns
// false after logging when a required step fails.
static bool apply_ltxv_refine_image_conditioning(sd_ctx_t* sd_ctx,
const sd_vid_gen_params_t* sd_vid_gen_params,
const GenerationRequest& request,
const ImageGenerationLatents& latents,
sd::Tensor<float>* latent,
sd::Tensor<float>* denoise_mask,
sd::Tensor<float>* video_positions) {
// NOTE(review): missing context/arguments are reported as success ("nothing to do"),
// not as failure - confirm this is intended for the null-pointer cases.
if (sd_ctx == nullptr || sd_ctx->sd == nullptr || sd_vid_gen_params == nullptr ||
latent == nullptr || latent->empty() || denoise_mask == nullptr || video_positions == nullptr) {
return true;
}
// No init image and no end image: leave all outputs untouched.
if (sd_vid_gen_params->init_image.data == nullptr &&
sd_vid_gen_params->end_image.data == nullptr) {
return true;
}
if (sd_ctx->sd->vae_decode_only) {
LOG_ERROR("LTXV refine image conditioning requires VAE encoder weights; create the context with vae_decode_only=false");
return false;
}
// Full-strength conditioning: every mask value written below evaluates to 1 - 1 = 0.
constexpr float conditioning_strength = 1.f;
int latent_channels = sd_ctx->sd->get_latent_channel();
sd::Tensor<float> video_latent = *latent;
sd::Tensor<float> audio_latent;
// More channels on axis 3 than the video latent has: split into video and audio parts.
if (latent->shape()[3] > latent_channels) {
video_latent = sd::ops::slice(*latent, 3, 0, latent_channels);
audio_latent = unpack_ltxav_audio_latent(*latent, latents.audio_length, latent_channels);
if (audio_latent.empty()) {
LOG_ERROR("failed to unpack LTXAV audio latent before image-to-video inplace conditioning");
return false;
}
}
// Pixel size used when converting the condition images: latent extent times the VAE scale factor.
int image_width = static_cast<int>(video_latent.shape()[0]) * request.vae_scale_factor;
int image_height = static_cast<int>(video_latent.shape()[1]) * request.vae_scale_factor;
sd::Tensor<float> video_mask = make_ltxav_video_denoise_mask(video_latent, 1.f);
// Init image is applied at latent index 0.
if (sd_vid_gen_params->init_image.data != nullptr) {
sd::Tensor<float> start_image = sd_image_to_tensor(sd_vid_gen_params->init_image, image_width, image_height);
if (!apply_ltxav_condition_image_by_latent_index(sd_ctx,
start_image,
&video_latent,
&video_mask,
0,
"init",
conditioning_strength)) {
return false;
}
}
if (sd_vid_gen_params->end_image.data != nullptr) {
sd::Tensor<float> end_image = sd_image_to_tensor(sd_vid_gen_params->end_image, image_width, image_height);
sd::Tensor<float> end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end");
if (end_image_latent.empty()) {
return false;
}
int frame_idx = request.frames - 1;
if (frame_idx == 0) {
// Single-frame request: the end image is written at latent index 0.
if (!apply_ltxav_condition_by_latent_index(&video_latent,
&video_mask,
end_image_latent,
0,
"end",
1.f - conditioning_strength)) {
return false;
}
} else {
// Keyframe case: requires the frame-count metadata recorded on `latents`.
if (latents.video_conditioning_frame_count <= 0 || latents.video_target_frame_count <= 0) {
LOG_ERROR("LTXV FLF2V refine conditioning requires low-resolution keyframe conditioning metadata");
return false;
}
// The end-image latent is written starting at index video_target_frame_count.
int64_t target_latent_frames = latents.video_target_frame_count;
if (!apply_ltxav_condition_by_latent_index(&video_latent,
&video_mask,
end_image_latent,
target_latent_frames,
"end",
1.f - conditioning_strength)) {
return false;
}
// Positions are rebuilt from the latent's current width/height extents.
*video_positions = build_ltxv_video_positions(video_latent.shape()[0],
video_latent.shape()[1],
target_latent_frames,
end_image_latent.shape()[2],
frame_idx,
1,
request.fps,
request.vae_scale_factor,
8,
true);
}
}
// Re-pack audio with the conditioned video latent/mask when audio was split off above.
if (!audio_latent.empty()) {
*latent = pack_ltxav_audio_and_video_latents(video_latent, audio_latent);
*denoise_mask = pack_ltxav_audio_and_video_denoise_mask(video_mask, video_latent, audio_latent);
} else {
*latent = std::move(video_latent);
*denoise_mask = std::move(video_mask);
}
LOG_INFO("LTXV refine image conditioning applied at %dx%d", image_width, image_height);
return true;
}
SD_API bool generate_video(sd_ctx_t* sd_ctx, SD_API bool generate_video(sd_ctx_t* sd_ctx,
const sd_vid_gen_params_t* sd_vid_gen_params, const sd_vid_gen_params_t* sd_vid_gen_params,
sd_image_t** frames_out, sd_image_t** frames_out,
@ -4659,6 +4964,23 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params; sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params;
GenerationRequest request(sd_ctx, sd_vid_gen_params); GenerationRequest request(sd_ctx, sd_vid_gen_params);
bool latent_upscale_enabled = request.hires.enabled;
GenerationRequest hires_request = request;
if (latent_upscale_enabled) {
if (!sd_version_is_ltxav(sd_ctx->sd->version)) {
LOG_ERROR("LTX latent spatial upscale is only supported for LTX video models");
return false;
}
if (request.hires.upscaler != SD_HIRES_UPSCALER_MODEL) {
LOG_ERROR("LTX latent spatial upscale currently requires hires upscaler MODEL");
return false;
}
if (strlen(SAFE_STR(request.hires.model_path)) == 0) {
LOG_ERROR("LTX latent spatial upscale is enabled but hires model path was not provided");
return false;
}
}
sd_ctx->sd->rng->manual_seed(request.seed); sd_ctx->sd->rng->manual_seed(request.seed);
sd_ctx->sd->sampler_rng->manual_seed(request.seed); sd_ctx->sd->sampler_rng->manual_seed(request.seed);
sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift); sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
@ -4670,14 +4992,22 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
return false; return false;
} }
ImageGenerationLatents latents = std::move(*latent_inputs_opt); ImageGenerationLatents latents = std::move(*latent_inputs_opt);
ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx,
sd_vid_gen_params, ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx,
request, sd_vid_gen_params,
latents); request,
LOG_INFO("generate_video %dx%dx%d", latents);
request.width, if (latent_upscale_enabled) {
request.height, LOG_INFO("generate_video %dx%dx%d -> LTX latent spatial upscale",
request.frames); request.width,
request.height,
request.frames);
} else {
LOG_INFO("generate_video %dx%dx%d",
request.width,
request.height,
request.frames);
}
int64_t latent_start = ggml_time_ms(); int64_t latent_start = ggml_time_ms();
int W = request.width / request.vae_scale_factor; int W = request.width / request.vae_scale_factor;
@ -4769,15 +5099,126 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
latents.video_positions); latents.video_positions);
int64_t sampling_end = ggml_time_ms(); int64_t sampling_end = ggml_time_ms();
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
if (final_latent.empty()) { if (final_latent.empty()) {
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
return false; return false;
} }
LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
if (latent_upscale_enabled) {
int64_t upscale_start = ggml_time_ms();
sd::Tensor<float> upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx,
request.hires.model_path,
final_latent,
latents.audio_length);
int64_t upscale_end = ggml_time_ms();
if (upscaled_latent.empty()) {
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
return false;
}
LOG_INFO("LTX latent spatial upscale completed, taking %.2fs",
(upscale_end - upscale_start) * 1.0f / 1000);
x_t = std::move(upscaled_latent);
hires_request.width = static_cast<int>(x_t.shape()[0]) * hires_request.vae_scale_factor;
hires_request.height = static_cast<int>(x_t.shape()[1]) * hires_request.vae_scale_factor;
if ((request.hires.target_width > 0 || request.hires.target_height > 0) &&
(request.hires.target_width != hires_request.width || request.hires.target_height != hires_request.height)) {
LOG_WARN("LTX latent spatial upsampler output is %dx%d; ignoring hires target %dx%d",
hires_request.width,
hires_request.height,
request.hires.target_width,
request.hires.target_height);
}
sd::Tensor<float> hires_denoise_mask;
sd::Tensor<float> hires_video_positions;
if (!apply_ltxv_refine_image_conditioning(sd_ctx,
sd_vid_gen_params,
hires_request,
latents,
&x_t,
&hires_denoise_mask,
&hires_video_positions)) {
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
return false;
}
noise = sd::Tensor<float>::randn_like(x_t, sd_ctx->sd->rng);
W = hires_request.width / hires_request.vae_scale_factor;
H = hires_request.height / hires_request.vae_scale_factor;
T = static_cast<int>(x_t.shape()[2]);
sample_method_t hires_sample_method = plan.sample_method;
int hires_scheduler_steps = 0;
std::vector<float> hires_sigma_sched =
make_hires_sigma_schedule(sd_ctx,
request.hires,
sd_vid_gen_params->sample_params,
hires_sample_method,
plan.sample_steps,
sd_ctx->sd->get_image_seq_len(hires_request.height, hires_request.width) * T,
&hires_scheduler_steps);
float hires_eta = resolve_eta(sd_ctx,
sd_vid_gen_params->sample_params.eta,
hires_sample_method);
LOG_DEBUG("sample(latent upscale) %dx%dx%d", W, H, T);
LOG_INFO("LTX latent spatial upscale refine: scheduler_steps=%d, denoising_strength=%.2f, sampler=%s, sigma_sched_size=%zu%s",
hires_scheduler_steps,
request.hires.denoising_strength,
sampling_methods_str[hires_sample_method],
hires_sigma_sched.size(),
request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");
sampling_start = ggml_time_ms();
final_latent = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,
true,
x_t,
std::move(noise),
embeds.cond,
hires_request.use_uncond ? embeds.uncond : SDCondition(),
embeds.img_cond,
embeds.id_cond,
sd::Tensor<float>(),
0.f,
sd_vid_gen_params->sample_params.guidance,
hires_eta,
sd_vid_gen_params->sample_params.shifted_timestep,
hires_sample_method,
sd_ctx->sd->is_flow_denoiser(),
plan.extra_sample_args,
hires_sigma_sched,
-1,
std::vector<sd::Tensor<float>>{},
false,
hires_denoise_mask,
sd::Tensor<float>(),
hires_request.vace_strength,
latents.audio_length,
static_cast<float>(hires_request.fps),
hires_request.cache_params,
hires_video_positions);
sampling_end = ggml_time_ms();
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
if (final_latent.empty()) {
LOG_ERROR("sampling(latent upscale) failed after %.2fs",
(sampling_end - sampling_start) * 1.0f / 1000);
return false;
}
LOG_INFO("sampling(latent upscale) completed, taking %.2fs",
(sampling_end - sampling_start) * 1.0f / 1000);
} else if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
sd_audio_t* generated_audio = nullptr; sd_audio_t* generated_audio = nullptr;
if (sd_version_is_ltxav(sd_ctx->sd->version) && if (sd_version_is_ltxav(sd_ctx->sd->version) &&
latents.audio_length > 0 && latents.audio_length > 0 &&
@ -4808,7 +5249,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
int64_t latent_end = ggml_time_ms(); int64_t latent_end = ggml_time_ms();
LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000); LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000);
auto result = decode_video_outputs(sd_ctx, request, final_latent, num_frames_out); auto result = decode_video_outputs(sd_ctx, latent_upscale_enabled ? hires_request : request, final_latent, num_frames_out);
if (result == nullptr) { if (result == nullptr) {
free_sd_audio(generated_audio); free_sd_audio(generated_audio);
return false; return false;