feat: add more built-in highres upscalers

This commit is contained in:
leejet 2026-04-23 21:58:34 +08:00
parent c97702e105
commit 53e4607ac8
11 changed files with 624 additions and 209 deletions

View File

@ -4,29 +4,29 @@
usage: ./bin/sd-cli [options] usage: ./bin/sd-cli [options]
CLI Options: CLI Options:
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default: -o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image
./output.png) (eg. output_%03d.png). For video generation, single-file outputs support .avi, .webm, and animated .webp sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs
--preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp support .avi, .webm, and animated .webp
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at --image <string> path to the image to inspect (for metadata mode)
every step) --metadata-format <string> metadata output format, one of [text, json] (default: text)
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise) --preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support
--image <string> path to the image to inspect (for metadata mode) .avi, .webm, and animated .webp
--metadata-format <string> metadata output format, one of [text, json] (default: text) --preview-interval <int> interval in denoising steps between consecutive updates of the image preview file
--canny apply canny preprocessor (edge detection) (default is 1, meaning updating at every step)
--convert-name convert tensor name (for convert mode) --output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified
convert mode writes `.gguf` or `.safetensors` based on the output extension. %d in output path, 1 otherwise)
`.safetensors` export currently supports f16, bf16, f32, and i32 tensor types only. --canny apply canny preprocessor (edge detection)
i32 is passthrough only; no f32 <-> i32 conversion is performed --convert-name convert tensor name (for convert mode)
-v, --verbose print extra info -v, --verbose print extra info
--color colors the logging tags according to level --color colors the logging tags according to level
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae) --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs --preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
--metadata-raw include raw hex previews for unparsed metadata payloads --metadata-raw include raw hex previews for unparsed metadata payloads
--metadata-brief truncate long metadata text values in text output --metadata-brief truncate long metadata text values in text output
--metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments --metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none) --preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
-h, --help show this help message and exit -h, --help show this help message and exit
Context Options: Context Options:
-m, --model <string> path to full model -m, --model <string> path to full model
@ -34,7 +34,8 @@ Context Options:
--clip_g <string> path to the clip-g text encoder --clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder --clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder --t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...) --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit --llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated. --qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated. --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
@ -46,16 +47,16 @@ Context Options:
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
--hires-upscalers-dir <string> highres fix upscaler model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model --photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model. --upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--mmap whether to memory-map model --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram) --control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram)
@ -70,20 +71,19 @@ Context Options:
--chroma-disable-dit-mask disable dit mask for chroma --chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma --chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
type of the weight file q4_K). If not specified, the default is the type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights flux2_flow]
contain any quantized parameters, the at_runtime mode will be used; otherwise, --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
immediately will be used.The immediately mode may have precision and auto. In auto mode, if the model weights contain any quantized parameters,
compatibility issues with quantized parameters, but it usually offers faster inference the at_runtime mode will be used; otherwise, immediately will be used.The
speed and, in some cases, lower memory usage. The at_runtime mode, on the immediately mode may have precision and compatibility issues with quantized
other hand, is exactly the opposite. parameters, but it usually offers faster inference speed and, in some cases,
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) lower memory usage. The at_runtime mode, on the other hand, is exactly the
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 opposite.
(overrides --vae-tile-size)
Generation Options: Generation Options:
-p, --prompt <string> the prompt to render -p, --prompt <string> the prompt to render
@ -92,69 +92,99 @@ Generation Options:
--end-img <string> path to the end image, required by flf2v --end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image --mask <string> path to the mask image
--control-image <string> path to control image, control net --control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in --control-video <string> path to control video frames, It must be a directory path. The video frames
lexicographical (character) order. For example, if the control video path is inside should be stored as images in lexicographical (character) order. For
`frames`, the directory contain images such as 00.png, 01.png, ... etc. example, if the control video path is `frames`, the directory contain images
such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
-H, --height <int> image height, in pixel space (default: 512) -H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512) -W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20) --steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto) --high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
will be 1 for SD1.x, 2 for SD2.x (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count -b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1) --video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24) --fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
NitroSD-Vibrant NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1) --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128) --upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same
as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
medium disabled, a value of 2.5 is nice for sd3.5 medium
--skip-layer-start <float> SLG enabling point (default: 0.01) --skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2) --skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto) --flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) (default: same as --cfg-scale)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
(default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2) --high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--strength <float> strength for noising/unnoising (default: 0.75) --strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float> --pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
`--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength --vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
--increase-ref-index automatically increase the indices of references images based on the order
they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images --disable-auto-resize-ref-image disable auto resize of ref images
--disable-image-metadata do not embed generation metadata on image files --disable-image-metadata do not embed generation metadata on image files
--vae-tiling process vae in tiles to reduce memory usage
--hires enable highres fix
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
otherwise) er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
euler_a otherwise res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
kl_optimal, lcm, bong_tangent], default: discrete smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache: --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2" Examples: "threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static' --scm-policy SCM policy: 'dynamic' (default) or 'static'
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
``` ```
Metadata mode inspects PNG/JPEG container metadata without loading any model: Metadata mode inspects PNG/JPEG container metadata without loading any model:

View File

@ -690,7 +690,10 @@ int main(int argc, const char* argv[]) {
vae_decode_only = false; vae_decode_only = false;
} }
if (gen_params.hires_enabled && !gen_params.hires_upscaler_model_path.empty()) { if (gen_params.hires_enabled &&
(gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL ||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS ||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) {
vae_decode_only = false; vae_decode_only = false;
} }

View File

@ -107,47 +107,60 @@ static bool is_absolute_path(const std::string& p) {
std::string ArgOptions::wrap_text(const std::string& text, size_t width, size_t indent) { std::string ArgOptions::wrap_text(const std::string& text, size_t width, size_t indent) {
std::ostringstream oss; std::ostringstream oss;
size_t line_len = 0;
size_t pos = 0; size_t pos = 0;
size_t line_len = 0;
while (pos < text.size()) { while (pos < text.size()) {
// Preserve manual newlines
if (text[pos] == '\n') { if (text[pos] == '\n') {
oss << '\n' oss << '\n'
<< std::string(indent, ' '); << std::string(indent, ' ');
line_len = indent; line_len = 0;
++pos; ++pos;
continue; continue;
} }
// Add the character if (std::isspace(static_cast<unsigned char>(text[pos]))) {
oss << text[pos]; ++pos;
++line_len; continue;
++pos; }
// If the current line exceeds width, try to break at the last space size_t word_start = pos;
if (line_len >= width) { while (pos < text.size() &&
std::string current = oss.str(); text[pos] != '\n' &&
size_t back = current.size(); !std::isspace(static_cast<unsigned char>(text[pos]))) {
++pos;
}
// Find the last space (for a clean break) std::string word = text.substr(word_start, pos - word_start);
while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n') while (!word.empty()) {
--back; size_t separator_len = line_len == 0 ? 0 : 1;
if (line_len + separator_len + word.size() <= width) {
// If found a space to break on if (separator_len > 0) {
if (back > 0 && current[back - 1] != '\n') { oss << ' ';
std::string before = current.substr(0, back - 1); ++line_len;
std::string after = current.substr(back); }
oss.str(""); oss << word;
oss.clear(); line_len += word.size();
oss << before << "\n" word.clear();
<< std::string(indent, ' ') << after; continue;
} else { }
// If no space found, just break at width
oss << "\n" if (line_len > 0) {
<< std::string(indent, ' '); oss << '\n'
<< std::string(indent, ' ');
line_len = 0;
continue;
}
size_t chunk_len = std::min(width, word.size());
oss << word.substr(0, chunk_len);
line_len = chunk_len;
word.erase(0, chunk_len);
if (!word.empty()) {
oss << '\n'
<< std::string(indent, ' ');
line_len = 0;
} }
line_len = indent;
} }
} }
@ -783,7 +796,9 @@ ArgOptions SDGenerationParams::get_options() {
&pm_id_embed_path}, &pm_id_embed_path},
{"", {"",
"--hires-upscaler", "--hires-upscaler",
"highres fix upscaler, Latent (nearest) or a model name/path under --hires-upscalers-dir (default: Latent (nearest))", "highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
"under --hires-upscalers-dir (default: Latent)",
&hires_upscaler}, &hires_upscaler},
}; };
@ -1918,7 +1933,7 @@ bool SDGenerationParams::resolve(const std::string& lora_model_dir, const std::s
hires_upscaler_model_path.clear(); hires_upscaler_model_path.clear();
if (hires_enabled) { if (hires_enabled) {
if (hires_upscaler.empty()) { if (hires_upscaler.empty()) {
hires_upscaler = "Latent (nearest)"; hires_upscaler = "Latent";
} }
resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str()); resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str());
if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) { if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) {

View File

@ -192,7 +192,7 @@ struct SDGenerationParams {
int upscale_tile_size = 128; int upscale_tile_size = 128;
bool hires_enabled = false; bool hires_enabled = false;
std::string hires_upscaler = "Latent (nearest)"; std::string hires_upscaler = "Latent";
std::string hires_upscaler_model_path; std::string hires_upscaler_model_path;
float hires_scale = 2.f; float hires_scale = 2.f;
int hires_width = 0; int hires_width = 0;

View File

@ -123,11 +123,11 @@ In this case, the server will load and serve the specified `index.html` file ins
usage: ./bin/sd-server [options] usage: ./bin/sd-server [options]
Svr Options: Svr Options:
-l, --listen-ip <string> server listen ip (default: 127.0.0.1) -l, --listen-ip <string> server listen ip (default: 127.0.0.1)
--serve-html-path <string> path to HTML file to serve at root (optional) --serve-html-path <string> path to HTML file to serve at root (optional)
--listen-port <int> server listen port (default: 1234) --listen-port <int> server listen port (default: 1234)
-v, --verbose print extra info -v, --verbose print extra info
--color colors the logging tags according to level --color colors the logging tags according to level
-h, --help show this help message and exit -h, --help show this help message and exit
Context Options: Context Options:
@ -136,7 +136,8 @@ Context Options:
--clip_g <string> path to the clip-g text encoder --clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder --clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder --t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...) --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit --llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated. --qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated. --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
@ -148,16 +149,16 @@ Context Options:
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
--hires-upscalers-dir <string> highres fix upscaler model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model --photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model. --upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--mmap whether to memory-map model --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram) --control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram)
@ -172,20 +173,19 @@ Context Options:
--chroma-disable-dit-mask disable dit mask for chroma --chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma --chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
type of the weight file q4_K). If not specified, the default is the type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights flux2_flow]
contain any quantized parameters, the at_runtime mode will be used; otherwise, --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
immediately will be used.The immediately mode may have precision and auto. In auto mode, if the model weights contain any quantized parameters,
compatibility issues with quantized parameters, but it usually offers faster inference the at_runtime mode will be used; otherwise, immediately will be used.The
speed and, in some cases, lower memory usage. The at_runtime mode, on the immediately mode may have precision and compatibility issues with quantized
other hand, is exactly the opposite. parameters, but it usually offers faster inference speed and, in some cases,
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) lower memory usage. The at_runtime mode, on the other hand, is exactly the
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 opposite.
(overrides --vae-tile-size)
Default Generation Options: Default Generation Options:
-p, --prompt <string> the prompt to render -p, --prompt <string> the prompt to render
@ -194,65 +194,97 @@ Default Generation Options:
--end-img <string> path to the end image, required by flf2v --end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image --mask <string> path to the mask image
--control-image <string> path to control image, control net --control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in --control-video <string> path to control video frames, It must be a directory path. The video frames
lexicographical (character) order. For example, if the control video path is inside should be stored as images in lexicographical (character) order. For
`frames`, the directory contain images such as 00.png, 01.png, ... etc. example, if the control video path is `frames`, the directory contain images
such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
-H, --height <int> image height, in pixel space (default: 512) -H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512) -W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20) --steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto) --high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
will be 1 for SD1.x, 2 for SD2.x (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count -b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1) --video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24) --fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
NitroSD-Vibrant NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1) --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128) --upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same
as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
medium disabled, a value of 2.5 is nice for sd3.5 medium
--skip-layer-start <float> SLG enabling point (default: 0.01) --skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2) --skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto) --flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) (default: same as --cfg-scale)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
(default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2) --high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--strength <float> strength for noising/unnoising (default: 0.75) --strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float> --pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
`--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength --vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
--increase-ref-index automatically increase the indices of references images based on the order
they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images --disable-auto-resize-ref-image disable auto resize of ref images
--disable-image-metadata do not embed generation metadata on image files --disable-image-metadata do not embed generation metadata on image files
--vae-tiling process vae in tiles to reduce memory usage
--hires enable highres fix
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
otherwise) er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
euler_a otherwise res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
kl_optimal, lcm, bong_tangent], default: discrete smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache: --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
"threshold=0.25" or "threshold=1.5,reset=0" Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache Examples: "threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static' --scm-policy SCM policy: 'dynamic' (default) or 'static'
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
``` ```

View File

@ -219,7 +219,7 @@ Currently supported request fields:
| `lora` | `array<object>` | Structured LoRA list | | `lora` | `array<object>` | Structured LoRA list |
| `extra_images` | `array<string>` | Base64 or data URL images | | `extra_images` | `array<string>` | Base64 or data URL images |
| `enable_hr` | `boolean` | Enable highres fix for `txt2img` | | `enable_hr` | `boolean` | Enable highres fix for `txt2img` |
| `hr_upscaler` | `string` | `Latent (nearest)` or an upscaler model name from `/sdapi/v1/upscalers` | | `hr_upscaler` | `string` | `Lanczos`, `Nearest`, a latent mode such as `Latent (nearest-exact)`, or an upscaler model name from `/sdapi/v1/upscalers` |
| `hr_scale` | `number` | Highres scale when resize target is not set | | `hr_scale` | `number` | Highres scale when resize target is not set |
| `hr_resize_x` | `integer` | Highres target width, `0` to use scale | | `hr_resize_x` | `integer` | Highres target width, `0` to use scale |
| `hr_resize_y` | `integer` | Highres target height, `0` to use scale | | `hr_resize_y` | `integer` | Highres target height, `0` to use scale |
@ -303,6 +303,8 @@ Built-in entries include `None`, `Lanczos`, and `Nearest`. Model-backed entries
| --- | --- | --- | | --- | --- | --- |
| `[].name` | `string` | WebUI-compatible latent upscale mode name | | `[].name` | `string` | WebUI-compatible latent upscale mode name |
Built-in latent modes include `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`.
`GET /sdapi/v1/samplers` `GET /sdapi/v1/samplers`
| Field | Type | Notes | | Field | Type | Notes |
@ -462,7 +464,7 @@ Shared nested fields:
| --- | --- | --- | | --- | --- | --- |
| `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` | | `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` |
Built-in entries include `None` and `Latent (nearest)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned. Built-in entries include `None`, `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.
`limits` `limits`
@ -677,7 +679,7 @@ Example:
"lora": [], "lora": [],
"hires": { "hires": {
"enabled": false, "enabled": false,
"upscaler": "Latent (nearest)", "upscaler": "Latent",
"scale": 2.0, "scale": 2.0,
"target_width": 0, "target_width": 0,
"target_height": 0, "target_height": 0,
@ -804,7 +806,7 @@ Other native fields:
| `scm_mask` | `string` | | `scm_mask` | `string` |
| `scm_policy_dynamic` | `boolean` | | `scm_policy_dynamic` | `boolean` |
For `hires.upscaler`, use `Latent (nearest)` for latent upscale or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory. For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory.
HTTP-only output fields: HTTP-only output fields:

View File

@ -381,6 +381,8 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
json result = json::array(); json result = json::array();
result.push_back(make_builtin("None")); result.push_back(make_builtin("None"));
result.push_back(make_builtin("Lanczos"));
result.push_back(make_builtin("Nearest"));
{ {
std::lock_guard<std::mutex> lock(*runtime->upscaler_mutex); std::lock_guard<std::mutex> lock(*runtime->upscaler_mutex);
@ -400,7 +402,12 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) { svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) {
json result = json::array({ json result = json::array({
{{"name", "Latent"}},
{{"name", "Latent (nearest)"}}, {{"name", "Latent (nearest)"}},
{{"name", "Latent (nearest-exact)"}},
{{"name", "Latent (antialiased)"}},
{{"name", "Latent (bicubic)"}},
{{"name", "Latent (bicubic antialiased)"}},
}); });
res.set_content(result.dump(), "application/json"); res.set_content(result.dump(), "application/json");
}); });

View File

@ -227,9 +227,30 @@ static json make_capabilities_json(ServerRuntime& runtime) {
available_upscalers.push_back({ available_upscalers.push_back({
{"name", "None"}, {"name", "None"},
}); });
available_upscalers.push_back({
{"name", "Lanczos"},
});
available_upscalers.push_back({
{"name", "Nearest"},
});
available_upscalers.push_back({
{"name", "Latent"},
});
available_upscalers.push_back({ available_upscalers.push_back({
{"name", "Latent (nearest)"}, {"name", "Latent (nearest)"},
}); });
available_upscalers.push_back({
{"name", "Latent (nearest-exact)"},
});
available_upscalers.push_back({
{"name", "Latent (antialiased)"},
});
available_upscalers.push_back({
{"name", "Latent (bicubic)"},
});
available_upscalers.push_back({
{"name", "Latent (bicubic antialiased)"},
});
{ {
std::lock_guard<std::mutex> lock(*runtime.upscaler_mutex); std::lock_guard<std::mutex> lock(*runtime.upscaler_mutex);
for (const auto& entry : *runtime.upscaler_cache) { for (const auto& entry : *runtime.upscaler_cache) {

View File

@ -291,7 +291,14 @@ typedef struct {
enum sd_hires_upscaler_t { enum sd_hires_upscaler_t {
SD_HIRES_UPSCALER_NONE, SD_HIRES_UPSCALER_NONE,
SD_HIRES_UPSCALER_LATENT,
SD_HIRES_UPSCALER_LATENT_NEAREST, SD_HIRES_UPSCALER_LATENT_NEAREST,
SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT,
SD_HIRES_UPSCALER_LATENT_ANTIALIASED,
SD_HIRES_UPSCALER_LATENT_BICUBIC,
SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED,
SD_HIRES_UPSCALER_LANCZOS,
SD_HIRES_UPSCALER_NEAREST,
SD_HIRES_UPSCALER_MODEL, SD_HIRES_UPSCALER_MODEL,
SD_HIRES_UPSCALER_COUNT, SD_HIRES_UPSCALER_COUNT,
}; };

View File

@ -2116,12 +2116,19 @@ enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) {
const char* hires_upscaler_to_str[] = { const char* hires_upscaler_to_str[] = {
"None", "None",
"Latent",
"Latent (nearest)", "Latent (nearest)",
"Latent (nearest-exact)",
"Latent (antialiased)",
"Latent (bicubic)",
"Latent (bicubic antialiased)",
"Lanczos",
"Nearest",
"Model", "Model",
}; };
const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler) { const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler) {
if (upscaler < SD_HIRES_UPSCALER_COUNT) { if (upscaler >= SD_HIRES_UPSCALER_NONE && upscaler < SD_HIRES_UPSCALER_COUNT) {
return hires_upscaler_to_str[upscaler]; return hires_upscaler_to_str[upscaler];
} }
return NONE_STR; return NONE_STR;
@ -2167,7 +2174,7 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
void sd_hires_params_init(sd_hires_params_t* hires_params) { void sd_hires_params_init(sd_hires_params_t* hires_params) {
*hires_params = {}; *hires_params = {};
hires_params->enabled = false; hires_params->enabled = false;
hires_params->upscaler = SD_HIRES_UPSCALER_LATENT_NEAREST; hires_params->upscaler = SD_HIRES_UPSCALER_LATENT;
hires_params->model_path = nullptr; hires_params->model_path = nullptr;
hires_params->scale = 2.0f; hires_params->scale = 2.0f;
hires_params->target_width = 0; hires_params->target_width = 0;
@ -2658,7 +2665,7 @@ struct GenerationRequest {
hires.enabled = false; hires.enabled = false;
return; return;
} }
if (hires.upscaler < SD_HIRES_UPSCALER_NONE && hires.upscaler >= SD_HIRES_UPSCALER_COUNT) { if (hires.upscaler < SD_HIRES_UPSCALER_NONE || hires.upscaler >= SD_HIRES_UPSCALER_COUNT) {
LOG_WARN("hires upscaler '%d' is invalid, disabling hires", hires.upscaler); LOG_WARN("hires upscaler '%d' is invalid, disabling hires", hires.upscaler);
hires.enabled = false; hires.enabled = false;
return; return;
@ -3252,55 +3259,123 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
const sd::Tensor<float>& latent, const sd::Tensor<float>& latent,
const GenerationRequest& request, const GenerationRequest& request,
UpscalerGGML* upscaler) { UpscalerGGML* upscaler) {
if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST) { auto get_hires_latent_target_shape = [&]() {
std::vector<int64_t> target_shape = latent.shape(); std::vector<int64_t> target_shape = latent.shape();
if (target_shape.size() < 2) { if (target_shape.size() < 2) {
LOG_ERROR("latent has invalid shape for hires upscale"); target_shape.clear();
return {}; return target_shape;
} }
target_shape[0] = request.hires.target_width / request.vae_scale_factor; target_shape[0] = request.hires.target_width / request.vae_scale_factor;
target_shape[1] = request.hires.target_height / request.vae_scale_factor; target_shape[1] = request.hires.target_height / request.vae_scale_factor;
return target_shape;
};
LOG_INFO("hires latent upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64, if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_ANTIALIASED ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED) {
std::vector<int64_t> target_shape = get_hires_latent_target_shape();
if (target_shape.empty()) {
LOG_ERROR("latent has invalid shape for hires upscale");
return {};
}
sd::ops::InterpolateMode mode = sd::ops::InterpolateMode::Nearest;
bool antialias = false;
switch (request.hires.upscaler) {
case SD_HIRES_UPSCALER_LATENT:
mode = sd::ops::InterpolateMode::Bilinear;
break;
case SD_HIRES_UPSCALER_LATENT_NEAREST:
mode = sd::ops::InterpolateMode::Nearest;
break;
case SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT:
mode = sd::ops::InterpolateMode::NearestExact;
break;
case SD_HIRES_UPSCALER_LATENT_ANTIALIASED:
mode = sd::ops::InterpolateMode::Bilinear;
antialias = true;
break;
case SD_HIRES_UPSCALER_LATENT_BICUBIC:
mode = sd::ops::InterpolateMode::Bicubic;
break;
case SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED:
mode = sd::ops::InterpolateMode::Bicubic;
antialias = true;
break;
default:
break;
}
LOG_INFO("hires %s upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64,
sd_hires_upscaler_name(request.hires.upscaler),
latent.shape()[0], latent.shape()[0],
latent.shape()[1], latent.shape()[1],
target_shape[0], target_shape[0],
target_shape[1]); target_shape[1]);
return sd::ops::interpolate(latent, target_shape, sd::ops::InterpolateMode::Nearest);
} else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) { return sd::ops::interpolate(latent, target_shape, mode, false, antialias);
if (upscaler == nullptr) { } else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL ||
LOG_ERROR("hires model upscaler context is null"); request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS ||
request.hires.upscaler == SD_HIRES_UPSCALER_NEAREST) {
if (sd_ctx->sd->vae_decode_only) {
LOG_ERROR("hires %s upscaler requires VAE encoder weights; create the context with vae_decode_only=false",
sd_hires_upscaler_name(request.hires.upscaler));
return {}; return {};
} }
if (sd_ctx->sd->vae_decode_only) { if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL && upscaler == nullptr) {
LOG_ERROR("hires model upscaler requires VAE encoder weights; create the context with vae_decode_only=false"); LOG_ERROR("hires model upscaler context is null");
return {}; return {};
} }
sd::Tensor<float> decoded = sd_ctx->sd->decode_first_stage(latent); sd::Tensor<float> decoded = sd_ctx->sd->decode_first_stage(latent);
if (decoded.empty()) { if (decoded.empty()) {
LOG_ERROR("decode_first_stage failed before hires model upscale"); LOG_ERROR("decode_first_stage failed before hires %s upscale",
sd_hires_upscaler_name(request.hires.upscaler));
return {}; return {};
} }
sd::Tensor<float> upscaled_tensor = upscaler->upscale_tensor(decoded); sd::Tensor<float> upscaled_tensor;
if (upscaled_tensor.empty()) { if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
LOG_ERROR("hires model upscale failed"); upscaled_tensor = upscaler->upscale_tensor(decoded);
return {}; if (upscaled_tensor.empty()) {
} LOG_ERROR("hires model upscale failed");
return {};
}
if (upscaled_tensor.shape()[0] != request.hires.target_width || if (upscaled_tensor.shape()[0] != request.hires.target_width ||
upscaled_tensor.shape()[1] != request.hires.target_height) { upscaled_tensor.shape()[1] != request.hires.target_height) {
upscaled_tensor = sd::ops::interpolate(upscaled_tensor, upscaled_tensor = sd::ops::interpolate(upscaled_tensor,
{request.hires.target_width,
request.hires.target_height,
upscaled_tensor.shape()[2],
upscaled_tensor.shape()[3]});
}
} else {
sd::ops::InterpolateMode mode = request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS
? sd::ops::InterpolateMode::Lanczos
: sd::ops::InterpolateMode::Nearest;
LOG_INFO("hires %s image upscale %" PRId64 "x%" PRId64 " -> %dx%d",
sd_hires_upscaler_name(request.hires.upscaler),
decoded.shape()[0],
decoded.shape()[1],
request.hires.target_width,
request.hires.target_height);
upscaled_tensor = sd::ops::interpolate(decoded,
{request.hires.target_width, {request.hires.target_width,
request.hires.target_height, request.hires.target_height,
upscaled_tensor.shape()[2], decoded.shape()[2],
upscaled_tensor.shape()[3]}); decoded.shape()[3]},
mode);
upscaled_tensor = sd::ops::clamp(upscaled_tensor, 0.0f, 1.0f);
} }
sd::Tensor<float> upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor); sd::Tensor<float> upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor);
if (upscaled_latent.empty()) { if (upscaled_latent.empty()) {
LOG_ERROR("encode_first_stage failed after hires model upscale"); LOG_ERROR("encode_first_stage failed after hires %s upscale",
sd_hires_upscaler_name(request.hires.upscaler));
} }
return upscaled_latent; return upscaled_latent;
} }

View File

@ -815,11 +815,202 @@ namespace sd {
namespace ops { namespace ops {
enum class InterpolateMode { enum class InterpolateMode {
Nearest, Nearest,
NearestExact,
NearestMax, NearestMax,
NearestMin, NearestMin,
NearestAvg, NearestAvg,
Bilinear,
Bicubic,
Lanczos,
}; };
// Returns true for the family of nearest-neighbor style interpolate modes
// (no fractional filtering; each output sample copies one input sample).
inline bool is_nearest_like_interpolate_mode(InterpolateMode mode) {
    switch (mode) {
        case InterpolateMode::Nearest:
        case InterpolateMode::NearestExact:
        case InterpolateMode::NearestMax:
        case InterpolateMode::NearestMin:
        case InterpolateMode::NearestAvg:
            return true;
        default:
            return false;
    }
}
// Returns true for modes implemented by the separable 2D filter path
// (weighted combination of multiple input samples per output sample).
inline bool is_2d_filter_interpolate_mode(InterpolateMode mode) {
    switch (mode) {
        case InterpolateMode::Bilinear:
        case InterpolateMode::Bicubic:
        case InterpolateMode::Lanczos:
            return true;
        default:
            return false;
    }
}
// Maps an output coordinate to its nearest input coordinate using half-pixel
// centers (the "nearest-exact" convention: round the back-projected center),
// clamped to the valid input range [0, input_size - 1].
inline int64_t nearest_exact_interpolate_index(int64_t output_index,
                                               int64_t input_size,
                                               int64_t output_size) {
    const double step = static_cast<double>(input_size) / static_cast<double>(output_size);
    // Back-project the center of the output pixel into input space.
    const double src = (static_cast<double>(output_index) + 0.5) * step - 0.5;
    const int64_t rounded = static_cast<int64_t>(std::floor(src + 0.5));
    const int64_t lower_clamped = std::max<int64_t>(rounded, 0);
    return std::min(lower_clamped, input_size - 1);
}
// Triangle (tent) filter weight: 1 at x == 0, falling linearly to 0 at |x| == 1,
// and 0 everywhere beyond.
inline double linear_interpolate_weight(double x) {
    const double d = std::abs(x);
    if (d >= 1.0) {
        return 0.0;
    }
    return 1.0 - d;
}
// Keys cubic convolution kernel with a = -0.75 (PyTorch's bicubic choice).
// Support is |x| < 2; the kernel is 1 at x == 0 and 0 at integer |x| >= 1.
inline double cubic_interpolate_weight(double x) {
    constexpr double a = -0.75;  // Match PyTorch bicubic interpolation.
    const double t = std::abs(x);
    if (t > 1.0) {
        if (t >= 2.0) {
            return 0.0;  // outside the kernel's support
        }
        // Outer lobe, 1 < t < 2.
        return ((a * t - 5.0 * a) * t + 8.0 * a) * t - 4.0 * a;
    }
    // Inner lobe, 0 <= t <= 1.
    return ((a + 2.0) * t - (a + 3.0)) * t * t + 1.0;
}
// Normalized sinc: sin(pi*x) / (pi*x), with the removable singularity at
// x == 0 handled explicitly (sinc(0) == 1).
inline double sinc(double x) {
    constexpr double kPi = 3.14159265358979323846;
    const double scaled = kPi * x;
    if (std::abs(x) >= 1e-12) {
        return std::sin(scaled) / scaled;
    }
    return 1.0;
}
// Lanczos-3 kernel weight: sinc(x) windowed by sinc(x / 3) inside |x| < 3,
// and exactly 0 outside that radius.
inline double lanczos_interpolate_weight(double x) {
    constexpr double radius = 3.0;
    const double d = std::abs(x);
    if (d < radius) {
        return sinc(d) * sinc(d / radius);
    }
    return 0.0;
}
// One (input index, weight) pair contributing to a single filtered output
// sample along one axis; a per-output-sample list of these describes the
// full 1D filter tap set.
struct InterpolateContributor {
    int64_t index;  // input coordinate along the axis (already clamped by the builder)
    double weight;  // filter weight applied to the sample at `index`
};
// Builds, for one axis, the per-output-sample list of input taps and weights
// for a separable resize filter.
//
// For each output coordinate the input-space center is computed with the
// half-pixel convention, the filter support window [start, end] is derived
// from the mode's radius (1 for bilinear, 2 for bicubic, 3 for lanczos),
// and a weight is evaluated per tap. When `antialias` is true and the image
// is being downscaled (scale > 1), the filter is widened by the scale factor
// so it acts as a low-pass filter. Out-of-range taps are clamped to the edge
// (edge-replicate padding). Throws via tensor_throw_invalid_argument for any
// mode that is not a 2D filter mode.
inline std::vector<std::vector<InterpolateContributor>> make_interpolate_contributors(
    int64_t input_size,
    int64_t output_size,
    InterpolateMode mode,
    bool antialias) {
    std::vector<std::vector<InterpolateContributor>> contributors(static_cast<size_t>(output_size));
    const double scale = static_cast<double>(input_size) / static_cast<double>(output_size);
    // Widen the kernel only when antialiasing a downscale; upscales keep width 1.
    const double filter_scale = antialias ? std::max(1.0, scale) : 1.0;
    for (int64_t out = 0; out < output_size; ++out) {
        // Half-pixel center of the output sample, projected into input space.
        const double center = (static_cast<double>(out) + 0.5) * scale - 0.5;
        int64_t start = 0;
        int64_t end = 0;
        if (mode == InterpolateMode::Bilinear) {
            const double support = filter_scale;
            start = static_cast<int64_t>(std::ceil(center - support));
            end = static_cast<int64_t>(std::floor(center + support));
        } else if (mode == InterpolateMode::Bicubic) {
            const double support = 2.0 * filter_scale;
            start = static_cast<int64_t>(std::ceil(center - support));
            end = static_cast<int64_t>(std::floor(center + support));
        } else if (mode == InterpolateMode::Lanczos) {
            const double support = 3.0 * filter_scale;
            start = static_cast<int64_t>(std::ceil(center - support));
            end = static_cast<int64_t>(std::floor(center + support));
        } else {
            tensor_throw_invalid_argument("Unsupported 2D filter interpolate mode: mode=" +
                                          std::to_string(static_cast<int>(mode)));
        }
        double weight_sum = 0.0;
        std::vector<InterpolateContributor>& axis_contributors = contributors[static_cast<size_t>(out)];
        axis_contributors.reserve(static_cast<size_t>(end - start + 1));
        for (int64_t in = start; in <= end; ++in) {
            // Evaluate the kernel at the (filter-scaled) distance from the center.
            double weight = 0.0;
            if (mode == InterpolateMode::Bilinear) {
                weight = linear_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
            } else if (mode == InterpolateMode::Bicubic) {
                weight = cubic_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
            } else {
                weight = lanczos_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
            }
            if (weight == 0.0) {
                continue;  // zero taps contribute nothing; skip storing them
            }
            // Edge-replicate: taps outside the image reuse the border sample.
            const int64_t clamped_index = std::min(std::max<int64_t>(in, 0), input_size - 1);
            axis_contributors.push_back({clamped_index, weight});
            weight_sum += weight;
        }
        // Normalize so weights sum to 1 where the raw kernel does not already
        // (widened antialias kernels, and lanczos whose lobes don't sum to 1).
        if ((antialias || mode == InterpolateMode::Lanczos) &&
            std::abs(weight_sum) > 1e-12) {
            for (auto& contributor : axis_contributors) {
                contributor.weight /= weight_sum;
            }
        }
        // Degenerate window (all taps were zero-weight): fall back to the
        // nearest input sample with full weight.
        if (axis_contributors.empty()) {
            const int64_t nearest = std::min(
                std::max<int64_t>(static_cast<int64_t>(std::floor(center + 0.5)), 0),
                input_size - 1);
            axis_contributors.push_back({nearest, 1.0});
        }
    }
    return contributors;
}
// Resizes the first two dimensions of `input` to `output_shape[0]` x
// `output_shape[1]` with a separable 2D filter (bilinear / bicubic / lanczos,
// selected by `mode`; `antialias` widens the kernel on downscale).
//
// Per this code's layout, dimension 0 is width and dimension 1 is height;
// all trailing dimensions (channels, batch, ...) must match between input
// and output and are processed as independent width*height planes.
// Accumulation is done in double and cast back to T at the end.
// Throws via tensor_throw_invalid_argument on rank < 2 or on a trailing
// dimension mismatch.
template <typename T>
inline Tensor<T> interpolate_2d_filter(const Tensor<T>& input,
                                       const std::vector<int64_t>& output_shape,
                                       InterpolateMode mode,
                                       bool antialias) {
    if (input.dim() < 2) {
        tensor_throw_invalid_argument("2D filter interpolate requires rank >= 2: input_shape=" +
                                      tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                      tensor_shape_to_string(output_shape));
    }
    // Only dims 0 and 1 may be resized; every trailing dim must be unchanged.
    for (size_t i = 2; i < output_shape.size(); ++i) {
        if (input.shape()[i] != output_shape[i]) {
            tensor_throw_invalid_argument("2D filter interpolate only supports resizing dimensions 0 and 1: input_shape=" +
                                          tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                          tensor_shape_to_string(output_shape));
        }
    }
    Tensor<T> output(output_shape);
    const int64_t input_width = input.shape()[0];
    const int64_t input_height = input.shape()[1];
    const int64_t output_width = output_shape[0];
    const int64_t output_height = output_shape[1];
    const int64_t input_plane = input_width * input_height;
    const int64_t output_plane = output_width * output_height;
    // Trailing dims are flattened into independent planes.
    const int64_t plane_count = input.numel() / input_plane;
    // Precompute per-axis taps once; the 2D filter is separable.
    auto x_contributors = make_interpolate_contributors(input_width, output_width, mode, antialias);
    auto y_contributors = make_interpolate_contributors(input_height, output_height, mode, antialias);
    for (int64_t plane = 0; plane < plane_count; ++plane) {
        const int64_t input_plane_offset = plane * input_plane;
        const int64_t output_plane_offset = plane * output_plane;
        for (int64_t y = 0; y < output_height; ++y) {
            const auto& y_axis = y_contributors[static_cast<size_t>(y)];
            for (int64_t x = 0; x < output_width; ++x) {
                const auto& x_axis = x_contributors[static_cast<size_t>(x)];
                // Weighted sum over the separable tap window, accumulated in double.
                double value = 0.0;
                for (const auto& yc : y_axis) {
                    const int64_t input_row_offset = input_plane_offset + yc.index * input_width;
                    for (const auto& xc : x_axis) {
                        value += static_cast<double>(input.data()[input_row_offset + xc.index]) *
                                 xc.weight * yc.weight;
                    }
                }
                output.data()[output_plane_offset + y * output_width + x] = static_cast<T>(value);
            }
        }
    }
    return output;
}
inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) { inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
if (index < 0) { if (index < 0) {
index += dim_size; index += dim_size;
@ -1014,17 +1205,20 @@ namespace sd {
inline Tensor<T> interpolate(const Tensor<T>& input, inline Tensor<T> interpolate(const Tensor<T>& input,
std::vector<int64_t> output_shape, std::vector<int64_t> output_shape,
InterpolateMode mode = InterpolateMode::Nearest, InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false) { bool align_corners = false,
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || bool antialias = false) {
mode == InterpolateMode::NearestMax || const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
mode == InterpolateMode::NearestMin || const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
mode == InterpolateMode::NearestAvg); if (!is_nearest_like_mode && !is_2d_filter_mode) {
if (!is_nearest_like_mode) { tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast<int>(mode)));
}
if (antialias && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
std::to_string(static_cast<int>(mode))); std::to_string(static_cast<int>(mode)));
} }
if (align_corners) { if (align_corners) {
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" + tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
tensor_shape_to_string(input.shape()) + ", output_shape=" + tensor_shape_to_string(input.shape()) + ", output_shape=" +
tensor_shape_to_string(output_shape)); tensor_shape_to_string(output_shape));
} }
@ -1051,6 +1245,10 @@ namespace sd {
} }
} }
if (is_2d_filter_mode) {
return interpolate_2d_filter(input, output_shape, mode, antialias);
}
bool has_downsampling = false; bool has_downsampling = false;
for (int64_t i = 0; i < input.dim(); ++i) { for (int64_t i = 0; i < input.dim(); ++i) {
if (input.shape()[i] > output_shape[i]) { if (input.shape()[i] > output_shape[i]) {
@ -1060,12 +1258,20 @@ namespace sd {
} }
Tensor<T> output(std::move(output_shape)); Tensor<T> output(std::move(output_shape));
if (mode == InterpolateMode::Nearest || !has_downsampling) { if (mode == InterpolateMode::Nearest ||
mode == InterpolateMode::NearestExact ||
!has_downsampling) {
for (int64_t flat = 0; flat < output.numel(); ++flat) { for (int64_t flat = 0; flat < output.numel(); ++flat) {
std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape()); std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0); std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) { for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i]; if (mode == InterpolateMode::NearestExact) {
input_coord[i] = nearest_exact_interpolate_index(output_coord[i],
input.shape()[i],
output.shape()[i]);
} else {
input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
}
} }
output[flat] = input.index(input_coord); output[flat] = input.index(input_coord);
} }
@ -1083,6 +1289,12 @@ namespace sd {
return T(0); return T(0);
case InterpolateMode::Nearest: case InterpolateMode::Nearest:
return T(0); return T(0);
case InterpolateMode::NearestExact:
return T(0);
case InterpolateMode::Bilinear:
case InterpolateMode::Bicubic:
case InterpolateMode::Lanczos:
break;
} }
tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" + tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
@ -1102,6 +1314,12 @@ namespace sd {
break; break;
case InterpolateMode::Nearest: case InterpolateMode::Nearest:
break; break;
case InterpolateMode::NearestExact:
break;
case InterpolateMode::Bilinear:
case InterpolateMode::Bicubic:
case InterpolateMode::Lanczos:
break;
} }
}; };
@ -1157,17 +1375,20 @@ namespace sd {
const std::optional<std::vector<int64_t>>& size, const std::optional<std::vector<int64_t>>& size,
const std::optional<std::vector<double>>& scale_factor, const std::optional<std::vector<double>>& scale_factor,
InterpolateMode mode = InterpolateMode::Nearest, InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false) { bool align_corners = false,
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || bool antialias = false) {
mode == InterpolateMode::NearestMax || const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
mode == InterpolateMode::NearestMin || const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
mode == InterpolateMode::NearestAvg); if (!is_nearest_like_mode && !is_2d_filter_mode) {
if (!is_nearest_like_mode) { tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast<int>(mode)));
}
if (antialias && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
std::to_string(static_cast<int>(mode))); std::to_string(static_cast<int>(mode)));
} }
if (align_corners) { if (align_corners) {
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" + tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
tensor_shape_to_string(input.shape())); tensor_shape_to_string(input.shape()));
} }
if (size.has_value() == scale_factor.has_value()) { if (size.has_value() == scale_factor.has_value()) {
@ -1211,7 +1432,7 @@ namespace sd {
} }
} }
return interpolate(input, std::move(output_shape), mode, align_corners); return interpolate(input, std::move(output_shape), mode, align_corners, antialias);
} }
template <typename T> template <typename T>
@ -1219,12 +1440,14 @@ namespace sd {
const std::optional<std::vector<int64_t>>& size, const std::optional<std::vector<int64_t>>& size,
double scale_factor, double scale_factor,
InterpolateMode mode = InterpolateMode::Nearest, InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false) { bool align_corners = false,
bool antialias = false) {
return interpolate(input, return interpolate(input,
size, size,
std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor), std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor),
mode, mode,
align_corners); align_corners,
antialias);
} }
template <typename T> template <typename T>