feat: add more built-in highres upscalers

This commit is contained in:
leejet 2026-04-23 21:58:34 +08:00
parent c97702e105
commit 53e4607ac8
11 changed files with 624 additions and 209 deletions

View File

@ -4,29 +4,29 @@
usage: ./bin/sd-cli [options] usage: ./bin/sd-cli [options]
CLI Options: CLI Options:
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default: -o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image
./output.png) (eg. output_%03d.png). For video generation, single-file outputs support .avi, .webm, and animated .webp sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs
--preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp support .avi, .webm, and animated .webp
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at --image <string> path to the image to inspect (for metadata mode)
every step) --metadata-format <string> metadata output format, one of [text, json] (default: text)
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise) --preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support
--image <string> path to the image to inspect (for metadata mode) .avi, .webm, and animated .webp
--metadata-format <string> metadata output format, one of [text, json] (default: text) --preview-interval <int> interval in denoising steps between consecutive updates of the image preview file
--canny apply canny preprocessor (edge detection) (default is 1, meaning updating at every step)
--convert-name convert tensor name (for convert mode) --output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified
convert mode writes `.gguf` or `.safetensors` based on the output extension. %d in output path, 1 otherwise)
`.safetensors` export currently supports f16, bf16, f32, and i32 tensor types only. --canny apply canny preprocessor (edge detection)
i32 is passthrough only; no f32 <-> i32 conversion is performed --convert-name convert tensor name (for convert mode)
-v, --verbose print extra info -v, --verbose print extra info
--color colors the logging tags according to level --color colors the logging tags according to level
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae) --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs --preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
--metadata-raw include raw hex previews for unparsed metadata payloads --metadata-raw include raw hex previews for unparsed metadata payloads
--metadata-brief truncate long metadata text values in text output --metadata-brief truncate long metadata text values in text output
--metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments --metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none) --preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
-h, --help show this help message and exit -h, --help show this help message and exit
Context Options: Context Options:
-m, --model <string> path to full model -m, --model <string> path to full model
@ -34,7 +34,8 @@ Context Options:
--clip_g <string> path to the clip-g text encoder --clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder --clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder --t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...) --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit --llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated. --qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated. --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
@ -46,16 +47,16 @@ Context Options:
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
--hires-upscalers-dir <string> highres fix upscaler model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model --photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model. --upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--mmap whether to memory-map model --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram) --control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram)
@ -70,20 +71,19 @@ Context Options:
--chroma-disable-dit-mask disable dit mask for chroma --chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma --chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
type of the weight file q4_K). If not specified, the default is the type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights flux2_flow]
contain any quantized parameters, the at_runtime mode will be used; otherwise, --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
immediately will be used.The immediately mode may have precision and auto. In auto mode, if the model weights contain any quantized parameters,
compatibility issues with quantized parameters, but it usually offers faster inference the at_runtime mode will be used; otherwise, immediately will be used.The
speed and, in some cases, lower memory usage. The at_runtime mode, on the immediately mode may have precision and compatibility issues with quantized
other hand, is exactly the opposite. parameters, but it usually offers faster inference speed and, in some cases,
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) lower memory usage. The at_runtime mode, on the other hand, is exactly the
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 opposite.
(overrides --vae-tile-size)
Generation Options: Generation Options:
-p, --prompt <string> the prompt to render -p, --prompt <string> the prompt to render
@ -92,69 +92,99 @@ Generation Options:
--end-img <string> path to the end image, required by flf2v --end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image --mask <string> path to the mask image
--control-image <string> path to control image, control net --control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in --control-video <string> path to control video frames, It must be a directory path. The video frames
lexicographical (character) order. For example, if the control video path is inside should be stored as images in lexicographical (character) order. For
`frames`, the directory contain images such as 00.png, 01.png, ... etc. example, if the control video path is `frames`, the directory contain images
such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
-H, --height <int> image height, in pixel space (default: 512) -H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512) -W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20) --steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto) --high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
will be 1 for SD1.x, 2 for SD2.x (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count -b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1) --video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24) --fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
NitroSD-Vibrant NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1) --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128) --upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same
as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
medium disabled, a value of 2.5 is nice for sd3.5 medium
--skip-layer-start <float> SLG enabling point (default: 0.01) --skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2) --skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto) --flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) (default: same as --cfg-scale)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
(default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2) --high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--strength <float> strength for noising/unnoising (default: 0.75) --strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float> --pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
`--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength --vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
--increase-ref-index automatically increase the indices of references images based on the order
they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images --disable-auto-resize-ref-image disable auto resize of ref images
--disable-image-metadata do not embed generation metadata on image files --disable-image-metadata do not embed generation metadata on image files
--vae-tiling process vae in tiles to reduce memory usage
--hires enable highres fix
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
otherwise) er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
euler_a otherwise res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
kl_optimal, lcm, bong_tangent], default: discrete smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache: --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2" Examples: "threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static' --scm-policy SCM policy: 'dynamic' (default) or 'static'
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
``` ```
Metadata mode inspects PNG/JPEG container metadata without loading any model: Metadata mode inspects PNG/JPEG container metadata without loading any model:

View File

@ -690,7 +690,10 @@ int main(int argc, const char* argv[]) {
vae_decode_only = false; vae_decode_only = false;
} }
if (gen_params.hires_enabled && !gen_params.hires_upscaler_model_path.empty()) { if (gen_params.hires_enabled &&
(gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL ||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS ||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) {
vae_decode_only = false; vae_decode_only = false;
} }

View File

@ -107,47 +107,60 @@ static bool is_absolute_path(const std::string& p) {
std::string ArgOptions::wrap_text(const std::string& text, size_t width, size_t indent) { std::string ArgOptions::wrap_text(const std::string& text, size_t width, size_t indent) {
std::ostringstream oss; std::ostringstream oss;
size_t line_len = 0;
size_t pos = 0; size_t pos = 0;
size_t line_len = 0;
while (pos < text.size()) { while (pos < text.size()) {
// Preserve manual newlines
if (text[pos] == '\n') { if (text[pos] == '\n') {
oss << '\n' oss << '\n'
<< std::string(indent, ' '); << std::string(indent, ' ');
line_len = indent; line_len = 0;
++pos; ++pos;
continue; continue;
} }
// Add the character if (std::isspace(static_cast<unsigned char>(text[pos]))) {
oss << text[pos]; ++pos;
++line_len; continue;
++pos; }
// If the current line exceeds width, try to break at the last space size_t word_start = pos;
if (line_len >= width) { while (pos < text.size() &&
std::string current = oss.str(); text[pos] != '\n' &&
size_t back = current.size(); !std::isspace(static_cast<unsigned char>(text[pos]))) {
++pos;
}
// Find the last space (for a clean break) std::string word = text.substr(word_start, pos - word_start);
while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n') while (!word.empty()) {
--back; size_t separator_len = line_len == 0 ? 0 : 1;
if (line_len + separator_len + word.size() <= width) {
// If found a space to break on if (separator_len > 0) {
if (back > 0 && current[back - 1] != '\n') { oss << ' ';
std::string before = current.substr(0, back - 1); ++line_len;
std::string after = current.substr(back); }
oss.str(""); oss << word;
oss.clear(); line_len += word.size();
oss << before << "\n" word.clear();
<< std::string(indent, ' ') << after; continue;
} else { }
// If no space found, just break at width
oss << "\n" if (line_len > 0) {
<< std::string(indent, ' '); oss << '\n'
<< std::string(indent, ' ');
line_len = 0;
continue;
}
size_t chunk_len = std::min(width, word.size());
oss << word.substr(0, chunk_len);
line_len = chunk_len;
word.erase(0, chunk_len);
if (!word.empty()) {
oss << '\n'
<< std::string(indent, ' ');
line_len = 0;
} }
line_len = indent;
} }
} }
@ -783,7 +796,9 @@ ArgOptions SDGenerationParams::get_options() {
&pm_id_embed_path}, &pm_id_embed_path},
{"", {"",
"--hires-upscaler", "--hires-upscaler",
"highres fix upscaler, Latent (nearest) or a model name/path under --hires-upscalers-dir (default: Latent (nearest))", "highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
"under --hires-upscalers-dir (default: Latent)",
&hires_upscaler}, &hires_upscaler},
}; };
@ -1918,7 +1933,7 @@ bool SDGenerationParams::resolve(const std::string& lora_model_dir, const std::s
hires_upscaler_model_path.clear(); hires_upscaler_model_path.clear();
if (hires_enabled) { if (hires_enabled) {
if (hires_upscaler.empty()) { if (hires_upscaler.empty()) {
hires_upscaler = "Latent (nearest)"; hires_upscaler = "Latent";
} }
resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str()); resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str());
if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) { if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) {

View File

@ -192,7 +192,7 @@ struct SDGenerationParams {
int upscale_tile_size = 128; int upscale_tile_size = 128;
bool hires_enabled = false; bool hires_enabled = false;
std::string hires_upscaler = "Latent (nearest)"; std::string hires_upscaler = "Latent";
std::string hires_upscaler_model_path; std::string hires_upscaler_model_path;
float hires_scale = 2.f; float hires_scale = 2.f;
int hires_width = 0; int hires_width = 0;

View File

@ -123,11 +123,11 @@ In this case, the server will load and serve the specified `index.html` file ins
usage: ./bin/sd-server [options] usage: ./bin/sd-server [options]
Svr Options: Svr Options:
-l, --listen-ip <string> server listen ip (default: 127.0.0.1) -l, --listen-ip <string> server listen ip (default: 127.0.0.1)
--serve-html-path <string> path to HTML file to serve at root (optional) --serve-html-path <string> path to HTML file to serve at root (optional)
--listen-port <int> server listen port (default: 1234) --listen-port <int> server listen port (default: 1234)
-v, --verbose print extra info -v, --verbose print extra info
--color colors the logging tags according to level --color colors the logging tags according to level
-h, --help show this help message and exit -h, --help show this help message and exit
Context Options: Context Options:
@ -136,7 +136,8 @@ Context Options:
--clip_g <string> path to the clip-g text encoder --clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder --clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder --t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...) --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit --llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated. --qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated. --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
@ -148,16 +149,16 @@ Context Options:
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
--hires-upscalers-dir <string> highres fix upscaler model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model --photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model. --upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--mmap whether to memory-map model --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram) --control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram)
@ -172,20 +173,19 @@ Context Options:
--chroma-disable-dit-mask disable dit mask for chroma --chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma --chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
type of the weight file q4_K). If not specified, the default is the type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights flux2_flow]
contain any quantized parameters, the at_runtime mode will be used; otherwise, --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
immediately will be used.The immediately mode may have precision and auto. In auto mode, if the model weights contain any quantized parameters,
compatibility issues with quantized parameters, but it usually offers faster inference the at_runtime mode will be used; otherwise, immediately will be used.The
speed and, in some cases, lower memory usage. The at_runtime mode, on the immediately mode may have precision and compatibility issues with quantized
other hand, is exactly the opposite. parameters, but it usually offers faster inference speed and, in some cases,
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) lower memory usage. The at_runtime mode, on the other hand, is exactly the
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 opposite.
(overrides --vae-tile-size)
Default Generation Options: Default Generation Options:
-p, --prompt <string> the prompt to render -p, --prompt <string> the prompt to render
@ -194,65 +194,97 @@ Default Generation Options:
--end-img <string> path to the end image, required by flf2v --end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image --mask <string> path to the mask image
--control-image <string> path to control image, control net --control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in --control-video <string> path to control video frames, It must be a directory path. The video frames
lexicographical (character) order. For example, if the control video path is inside should be stored as images in lexicographical (character) order. For
`frames`, the directory contain images such as 00.png, 01.png, ... etc. example, if the control video path is `frames`, the directory contain images
such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
-H, --height <int> image height, in pixel space (default: 512) -H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512) -W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20) --steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto) --high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
will be 1 for SD1.x, 2 for SD2.x (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count -b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1) --video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24) --fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
NitroSD-Vibrant NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1) --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128) --upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same
as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
medium disabled, a value of 2.5 is nice for sd3.5 medium
--skip-layer-start <float> SLG enabling point (default: 0.01) --skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2) --skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto) --flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) (default: same as --cfg-scale)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
(default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2) --high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--strength <float> strength for noising/unnoising (default: 0.75) --strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float> --pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
`--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength --vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
--increase-ref-index automatically increase the indices of references images based on the order
they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images --disable-auto-resize-ref-image disable auto resize of ref images
--disable-image-metadata do not embed generation metadata on image files --disable-image-metadata do not embed generation metadata on image files
--vae-tiling process vae in tiles to reduce memory usage
--hires enable highres fix
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
otherwise) er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
euler_a otherwise res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
kl_optimal, lcm, bong_tangent], default: discrete smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache: --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
"threshold=0.25" or "threshold=1.5,reset=0" Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache Examples: "threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static' --scm-policy SCM policy: 'dynamic' (default) or 'static'
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
``` ```

View File

@ -219,7 +219,7 @@ Currently supported request fields:
| `lora` | `array<object>` | Structured LoRA list | | `lora` | `array<object>` | Structured LoRA list |
| `extra_images` | `array<string>` | Base64 or data URL images | | `extra_images` | `array<string>` | Base64 or data URL images |
| `enable_hr` | `boolean` | Enable highres fix for `txt2img` | | `enable_hr` | `boolean` | Enable highres fix for `txt2img` |
| `hr_upscaler` | `string` | `Latent (nearest)` or an upscaler model name from `/sdapi/v1/upscalers` | | `hr_upscaler` | `string` | `Lanczos`, `Nearest`, a latent mode such as `Latent (nearest-exact)`, or an upscaler model name from `/sdapi/v1/upscalers` |
| `hr_scale` | `number` | Highres scale when resize target is not set | | `hr_scale` | `number` | Highres scale when resize target is not set |
| `hr_resize_x` | `integer` | Highres target width, `0` to use scale | | `hr_resize_x` | `integer` | Highres target width, `0` to use scale |
| `hr_resize_y` | `integer` | Highres target height, `0` to use scale | | `hr_resize_y` | `integer` | Highres target height, `0` to use scale |
@ -303,6 +303,8 @@ Built-in entries include `None`, `Lanczos`, and `Nearest`. Model-backed entries
| --- | --- | --- | | --- | --- | --- |
| `[].name` | `string` | WebUI-compatible latent upscale mode name | | `[].name` | `string` | WebUI-compatible latent upscale mode name |
Built-in latent modes include `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`.
`GET /sdapi/v1/samplers` `GET /sdapi/v1/samplers`
| Field | Type | Notes | | Field | Type | Notes |
@ -462,7 +464,7 @@ Shared nested fields:
| --- | --- | --- | | --- | --- | --- |
| `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` | | `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` |
Built-in entries include `None` and `Latent (nearest)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned. Built-in entries include `None`, `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.
`limits` `limits`
@ -677,7 +679,7 @@ Example:
"lora": [], "lora": [],
"hires": { "hires": {
"enabled": false, "enabled": false,
"upscaler": "Latent (nearest)", "upscaler": "Latent",
"scale": 2.0, "scale": 2.0,
"target_width": 0, "target_width": 0,
"target_height": 0, "target_height": 0,
@ -804,7 +806,7 @@ Other native fields:
| `scm_mask` | `string` | | `scm_mask` | `string` |
| `scm_policy_dynamic` | `boolean` | | `scm_policy_dynamic` | `boolean` |
For `hires.upscaler`, use `Latent (nearest)` for latent upscale or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory. For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory.
HTTP-only output fields: HTTP-only output fields:

View File

@ -381,6 +381,8 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
json result = json::array(); json result = json::array();
result.push_back(make_builtin("None")); result.push_back(make_builtin("None"));
result.push_back(make_builtin("Lanczos"));
result.push_back(make_builtin("Nearest"));
{ {
std::lock_guard<std::mutex> lock(*runtime->upscaler_mutex); std::lock_guard<std::mutex> lock(*runtime->upscaler_mutex);
@ -400,7 +402,12 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) { svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) {
json result = json::array({ json result = json::array({
{{"name", "Latent"}},
{{"name", "Latent (nearest)"}}, {{"name", "Latent (nearest)"}},
{{"name", "Latent (nearest-exact)"}},
{{"name", "Latent (antialiased)"}},
{{"name", "Latent (bicubic)"}},
{{"name", "Latent (bicubic antialiased)"}},
}); });
res.set_content(result.dump(), "application/json"); res.set_content(result.dump(), "application/json");
}); });

View File

@ -227,9 +227,30 @@ static json make_capabilities_json(ServerRuntime& runtime) {
available_upscalers.push_back({ available_upscalers.push_back({
{"name", "None"}, {"name", "None"},
}); });
available_upscalers.push_back({
{"name", "Lanczos"},
});
available_upscalers.push_back({
{"name", "Nearest"},
});
available_upscalers.push_back({
{"name", "Latent"},
});
available_upscalers.push_back({ available_upscalers.push_back({
{"name", "Latent (nearest)"}, {"name", "Latent (nearest)"},
}); });
available_upscalers.push_back({
{"name", "Latent (nearest-exact)"},
});
available_upscalers.push_back({
{"name", "Latent (antialiased)"},
});
available_upscalers.push_back({
{"name", "Latent (bicubic)"},
});
available_upscalers.push_back({
{"name", "Latent (bicubic antialiased)"},
});
{ {
std::lock_guard<std::mutex> lock(*runtime.upscaler_mutex); std::lock_guard<std::mutex> lock(*runtime.upscaler_mutex);
for (const auto& entry : *runtime.upscaler_cache) { for (const auto& entry : *runtime.upscaler_cache) {

View File

@ -291,7 +291,14 @@ typedef struct {
enum sd_hires_upscaler_t { enum sd_hires_upscaler_t {
SD_HIRES_UPSCALER_NONE, SD_HIRES_UPSCALER_NONE,
SD_HIRES_UPSCALER_LATENT,
SD_HIRES_UPSCALER_LATENT_NEAREST, SD_HIRES_UPSCALER_LATENT_NEAREST,
SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT,
SD_HIRES_UPSCALER_LATENT_ANTIALIASED,
SD_HIRES_UPSCALER_LATENT_BICUBIC,
SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED,
SD_HIRES_UPSCALER_LANCZOS,
SD_HIRES_UPSCALER_NEAREST,
SD_HIRES_UPSCALER_MODEL, SD_HIRES_UPSCALER_MODEL,
SD_HIRES_UPSCALER_COUNT, SD_HIRES_UPSCALER_COUNT,
}; };

View File

@ -2116,12 +2116,19 @@ enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) {
const char* hires_upscaler_to_str[] = { const char* hires_upscaler_to_str[] = {
"None", "None",
"Latent",
"Latent (nearest)", "Latent (nearest)",
"Latent (nearest-exact)",
"Latent (antialiased)",
"Latent (bicubic)",
"Latent (bicubic antialiased)",
"Lanczos",
"Nearest",
"Model", "Model",
}; };
const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler) { const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler) {
if (upscaler < SD_HIRES_UPSCALER_COUNT) { if (upscaler >= SD_HIRES_UPSCALER_NONE && upscaler < SD_HIRES_UPSCALER_COUNT) {
return hires_upscaler_to_str[upscaler]; return hires_upscaler_to_str[upscaler];
} }
return NONE_STR; return NONE_STR;
@ -2167,7 +2174,7 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
void sd_hires_params_init(sd_hires_params_t* hires_params) { void sd_hires_params_init(sd_hires_params_t* hires_params) {
*hires_params = {}; *hires_params = {};
hires_params->enabled = false; hires_params->enabled = false;
hires_params->upscaler = SD_HIRES_UPSCALER_LATENT_NEAREST; hires_params->upscaler = SD_HIRES_UPSCALER_LATENT;
hires_params->model_path = nullptr; hires_params->model_path = nullptr;
hires_params->scale = 2.0f; hires_params->scale = 2.0f;
hires_params->target_width = 0; hires_params->target_width = 0;
@ -2658,7 +2665,7 @@ struct GenerationRequest {
hires.enabled = false; hires.enabled = false;
return; return;
} }
if (hires.upscaler < SD_HIRES_UPSCALER_NONE && hires.upscaler >= SD_HIRES_UPSCALER_COUNT) { if (hires.upscaler < SD_HIRES_UPSCALER_NONE || hires.upscaler >= SD_HIRES_UPSCALER_COUNT) {
LOG_WARN("hires upscaler '%d' is invalid, disabling hires", hires.upscaler); LOG_WARN("hires upscaler '%d' is invalid, disabling hires", hires.upscaler);
hires.enabled = false; hires.enabled = false;
return; return;
@ -3252,55 +3259,123 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
const sd::Tensor<float>& latent, const sd::Tensor<float>& latent,
const GenerationRequest& request, const GenerationRequest& request,
UpscalerGGML* upscaler) { UpscalerGGML* upscaler) {
if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST) { auto get_hires_latent_target_shape = [&]() {
std::vector<int64_t> target_shape = latent.shape(); std::vector<int64_t> target_shape = latent.shape();
if (target_shape.size() < 2) { if (target_shape.size() < 2) {
LOG_ERROR("latent has invalid shape for hires upscale"); target_shape.clear();
return {}; return target_shape;
} }
target_shape[0] = request.hires.target_width / request.vae_scale_factor; target_shape[0] = request.hires.target_width / request.vae_scale_factor;
target_shape[1] = request.hires.target_height / request.vae_scale_factor; target_shape[1] = request.hires.target_height / request.vae_scale_factor;
return target_shape;
};
LOG_INFO("hires latent upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64, if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_ANTIALIASED ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED) {
std::vector<int64_t> target_shape = get_hires_latent_target_shape();
if (target_shape.empty()) {
LOG_ERROR("latent has invalid shape for hires upscale");
return {};
}
sd::ops::InterpolateMode mode = sd::ops::InterpolateMode::Nearest;
bool antialias = false;
switch (request.hires.upscaler) {
case SD_HIRES_UPSCALER_LATENT:
mode = sd::ops::InterpolateMode::Bilinear;
break;
case SD_HIRES_UPSCALER_LATENT_NEAREST:
mode = sd::ops::InterpolateMode::Nearest;
break;
case SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT:
mode = sd::ops::InterpolateMode::NearestExact;
break;
case SD_HIRES_UPSCALER_LATENT_ANTIALIASED:
mode = sd::ops::InterpolateMode::Bilinear;
antialias = true;
break;
case SD_HIRES_UPSCALER_LATENT_BICUBIC:
mode = sd::ops::InterpolateMode::Bicubic;
break;
case SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED:
mode = sd::ops::InterpolateMode::Bicubic;
antialias = true;
break;
default:
break;
}
LOG_INFO("hires %s upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64,
sd_hires_upscaler_name(request.hires.upscaler),
latent.shape()[0], latent.shape()[0],
latent.shape()[1], latent.shape()[1],
target_shape[0], target_shape[0],
target_shape[1]); target_shape[1]);
return sd::ops::interpolate(latent, target_shape, sd::ops::InterpolateMode::Nearest);
} else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) { return sd::ops::interpolate(latent, target_shape, mode, false, antialias);
if (upscaler == nullptr) { } else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL ||
LOG_ERROR("hires model upscaler context is null"); request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS ||
request.hires.upscaler == SD_HIRES_UPSCALER_NEAREST) {
if (sd_ctx->sd->vae_decode_only) {
LOG_ERROR("hires %s upscaler requires VAE encoder weights; create the context with vae_decode_only=false",
sd_hires_upscaler_name(request.hires.upscaler));
return {}; return {};
} }
if (sd_ctx->sd->vae_decode_only) { if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL && upscaler == nullptr) {
LOG_ERROR("hires model upscaler requires VAE encoder weights; create the context with vae_decode_only=false"); LOG_ERROR("hires model upscaler context is null");
return {}; return {};
} }
sd::Tensor<float> decoded = sd_ctx->sd->decode_first_stage(latent); sd::Tensor<float> decoded = sd_ctx->sd->decode_first_stage(latent);
if (decoded.empty()) { if (decoded.empty()) {
LOG_ERROR("decode_first_stage failed before hires model upscale"); LOG_ERROR("decode_first_stage failed before hires %s upscale",
sd_hires_upscaler_name(request.hires.upscaler));
return {}; return {};
} }
sd::Tensor<float> upscaled_tensor = upscaler->upscale_tensor(decoded); sd::Tensor<float> upscaled_tensor;
if (upscaled_tensor.empty()) { if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
LOG_ERROR("hires model upscale failed"); upscaled_tensor = upscaler->upscale_tensor(decoded);
return {}; if (upscaled_tensor.empty()) {
} LOG_ERROR("hires model upscale failed");
return {};
}
if (upscaled_tensor.shape()[0] != request.hires.target_width || if (upscaled_tensor.shape()[0] != request.hires.target_width ||
upscaled_tensor.shape()[1] != request.hires.target_height) { upscaled_tensor.shape()[1] != request.hires.target_height) {
upscaled_tensor = sd::ops::interpolate(upscaled_tensor, upscaled_tensor = sd::ops::interpolate(upscaled_tensor,
{request.hires.target_width,
request.hires.target_height,
upscaled_tensor.shape()[2],
upscaled_tensor.shape()[3]});
}
} else {
sd::ops::InterpolateMode mode = request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS
? sd::ops::InterpolateMode::Lanczos
: sd::ops::InterpolateMode::Nearest;
LOG_INFO("hires %s image upscale %" PRId64 "x%" PRId64 " -> %dx%d",
sd_hires_upscaler_name(request.hires.upscaler),
decoded.shape()[0],
decoded.shape()[1],
request.hires.target_width,
request.hires.target_height);
upscaled_tensor = sd::ops::interpolate(decoded,
{request.hires.target_width, {request.hires.target_width,
request.hires.target_height, request.hires.target_height,
upscaled_tensor.shape()[2], decoded.shape()[2],
upscaled_tensor.shape()[3]}); decoded.shape()[3]},
mode);
upscaled_tensor = sd::ops::clamp(upscaled_tensor, 0.0f, 1.0f);
} }
sd::Tensor<float> upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor); sd::Tensor<float> upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor);
if (upscaled_latent.empty()) { if (upscaled_latent.empty()) {
LOG_ERROR("encode_first_stage failed after hires model upscale"); LOG_ERROR("encode_first_stage failed after hires %s upscale",
sd_hires_upscaler_name(request.hires.upscaler));
} }
return upscaled_latent; return upscaled_latent;
} }

View File

@ -815,11 +815,202 @@ namespace sd {
namespace ops { namespace ops {
enum class InterpolateMode { enum class InterpolateMode {
Nearest, Nearest,
NearestExact,
NearestMax, NearestMax,
NearestMin, NearestMin,
NearestAvg, NearestAvg,
Bilinear,
Bicubic,
Lanczos,
}; };
// Returns true for the family of nearest-neighbor style interpolate modes
// (no fractional filtering; each output sample copies one input sample).
inline bool is_nearest_like_interpolate_mode(InterpolateMode mode) {
    switch (mode) {
        case InterpolateMode::Nearest:
        case InterpolateMode::NearestExact:
        case InterpolateMode::NearestMax:
        case InterpolateMode::NearestMin:
        case InterpolateMode::NearestAvg:
            return true;
        default:
            return false;
    }
}
// Returns true for modes implemented by the separable 2D filter path
// (weighted combination of multiple input samples per output sample).
inline bool is_2d_filter_interpolate_mode(InterpolateMode mode) {
    switch (mode) {
        case InterpolateMode::Bilinear:
        case InterpolateMode::Bicubic:
        case InterpolateMode::Lanczos:
            return true;
        default:
            return false;
    }
}
// Maps an output coordinate to its nearest input coordinate using half-pixel
// centers (the "nearest-exact" convention: round the back-projected center),
// clamped to the valid input range [0, input_size - 1].
inline int64_t nearest_exact_interpolate_index(int64_t output_index,
                                               int64_t input_size,
                                               int64_t output_size) {
    const double step = static_cast<double>(input_size) / static_cast<double>(output_size);
    // Back-project the center of the output pixel into input space.
    const double src = (static_cast<double>(output_index) + 0.5) * step - 0.5;
    const int64_t rounded = static_cast<int64_t>(std::floor(src + 0.5));
    const int64_t lower_clamped = std::max<int64_t>(rounded, 0);
    return std::min(lower_clamped, input_size - 1);
}
// Triangle (tent) filter weight: 1 at x == 0, falling linearly to 0 at |x| == 1,
// and 0 everywhere beyond.
inline double linear_interpolate_weight(double x) {
    const double d = std::abs(x);
    if (d >= 1.0) {
        return 0.0;
    }
    return 1.0 - d;
}
// Keys cubic convolution kernel with a = -0.75 (PyTorch's bicubic choice).
// Support is |x| < 2; the kernel is 1 at x == 0 and 0 at integer |x| >= 1.
inline double cubic_interpolate_weight(double x) {
    constexpr double a = -0.75;  // Match PyTorch bicubic interpolation.
    const double t = std::abs(x);
    if (t > 1.0) {
        if (t >= 2.0) {
            return 0.0;  // outside the kernel's support
        }
        // Outer lobe, 1 < t < 2.
        return ((a * t - 5.0 * a) * t + 8.0 * a) * t - 4.0 * a;
    }
    // Inner lobe, 0 <= t <= 1.
    return ((a + 2.0) * t - (a + 3.0)) * t * t + 1.0;
}
// Normalized sinc: sin(pi*x) / (pi*x), with the removable singularity at
// x == 0 handled explicitly (sinc(0) == 1).
inline double sinc(double x) {
    constexpr double kPi = 3.14159265358979323846;
    const double scaled = kPi * x;
    if (std::abs(x) >= 1e-12) {
        return std::sin(scaled) / scaled;
    }
    return 1.0;
}
// Lanczos-3 kernel weight: sinc(x) windowed by sinc(x / 3) inside |x| < 3,
// and exactly 0 outside that radius.
inline double lanczos_interpolate_weight(double x) {
    constexpr double radius = 3.0;
    const double d = std::abs(x);
    if (d < radius) {
        return sinc(d) * sinc(d / radius);
    }
    return 0.0;
}
// One (input index, weight) pair contributing to a single filtered output
// sample along one axis; a per-output-sample list of these describes the
// full 1D filter tap set.
struct InterpolateContributor {
    int64_t index;  // input coordinate along the axis (already clamped by the builder)
    double weight;  // filter weight applied to the sample at `index`
};
// Builds, for one axis, the per-output-sample list of input taps and weights
// for a separable resize filter.
//
// For each output coordinate the input-space center is computed with the
// half-pixel convention, the filter support window [start, end] is derived
// from the mode's radius (1 for bilinear, 2 for bicubic, 3 for lanczos),
// and a weight is evaluated per tap. When `antialias` is true and the image
// is being downscaled (scale > 1), the filter is widened by the scale factor
// so it acts as a low-pass filter. Out-of-range taps are clamped to the edge
// (edge-replicate padding). Throws via tensor_throw_invalid_argument for any
// mode that is not a 2D filter mode.
inline std::vector<std::vector<InterpolateContributor>> make_interpolate_contributors(
    int64_t input_size,
    int64_t output_size,
    InterpolateMode mode,
    bool antialias) {
    std::vector<std::vector<InterpolateContributor>> contributors(static_cast<size_t>(output_size));
    const double scale = static_cast<double>(input_size) / static_cast<double>(output_size);
    // Widen the kernel only when antialiasing a downscale; upscales keep width 1.
    const double filter_scale = antialias ? std::max(1.0, scale) : 1.0;
    for (int64_t out = 0; out < output_size; ++out) {
        // Half-pixel center of the output sample, projected into input space.
        const double center = (static_cast<double>(out) + 0.5) * scale - 0.5;
        int64_t start = 0;
        int64_t end = 0;
        if (mode == InterpolateMode::Bilinear) {
            const double support = filter_scale;
            start = static_cast<int64_t>(std::ceil(center - support));
            end = static_cast<int64_t>(std::floor(center + support));
        } else if (mode == InterpolateMode::Bicubic) {
            const double support = 2.0 * filter_scale;
            start = static_cast<int64_t>(std::ceil(center - support));
            end = static_cast<int64_t>(std::floor(center + support));
        } else if (mode == InterpolateMode::Lanczos) {
            const double support = 3.0 * filter_scale;
            start = static_cast<int64_t>(std::ceil(center - support));
            end = static_cast<int64_t>(std::floor(center + support));
        } else {
            tensor_throw_invalid_argument("Unsupported 2D filter interpolate mode: mode=" +
                                          std::to_string(static_cast<int>(mode)));
        }
        double weight_sum = 0.0;
        std::vector<InterpolateContributor>& axis_contributors = contributors[static_cast<size_t>(out)];
        axis_contributors.reserve(static_cast<size_t>(end - start + 1));
        for (int64_t in = start; in <= end; ++in) {
            // Evaluate the kernel at the (filter-scaled) distance from the center.
            double weight = 0.0;
            if (mode == InterpolateMode::Bilinear) {
                weight = linear_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
            } else if (mode == InterpolateMode::Bicubic) {
                weight = cubic_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
            } else {
                weight = lanczos_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
            }
            if (weight == 0.0) {
                continue;  // zero taps contribute nothing; skip storing them
            }
            // Edge-replicate: taps outside the image reuse the border sample.
            const int64_t clamped_index = std::min(std::max<int64_t>(in, 0), input_size - 1);
            axis_contributors.push_back({clamped_index, weight});
            weight_sum += weight;
        }
        // Normalize so weights sum to 1 where the raw kernel does not already
        // (widened antialias kernels, and lanczos whose lobes don't sum to 1).
        if ((antialias || mode == InterpolateMode::Lanczos) &&
            std::abs(weight_sum) > 1e-12) {
            for (auto& contributor : axis_contributors) {
                contributor.weight /= weight_sum;
            }
        }
        // Degenerate window (all taps were zero-weight): fall back to the
        // nearest input sample with full weight.
        if (axis_contributors.empty()) {
            const int64_t nearest = std::min(
                std::max<int64_t>(static_cast<int64_t>(std::floor(center + 0.5)), 0),
                input_size - 1);
            axis_contributors.push_back({nearest, 1.0});
        }
    }
    return contributors;
}
// Resizes the first two dimensions of `input` to `output_shape[0]` x
// `output_shape[1]` with a separable 2D filter (bilinear / bicubic / lanczos,
// selected by `mode`; `antialias` widens the kernel on downscale).
//
// Per this code's layout, dimension 0 is width and dimension 1 is height;
// all trailing dimensions (channels, batch, ...) must match between input
// and output and are processed as independent width*height planes.
// Accumulation is done in double and cast back to T at the end.
// Throws via tensor_throw_invalid_argument on rank < 2 or on a trailing
// dimension mismatch.
template <typename T>
inline Tensor<T> interpolate_2d_filter(const Tensor<T>& input,
                                       const std::vector<int64_t>& output_shape,
                                       InterpolateMode mode,
                                       bool antialias) {
    if (input.dim() < 2) {
        tensor_throw_invalid_argument("2D filter interpolate requires rank >= 2: input_shape=" +
                                      tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                      tensor_shape_to_string(output_shape));
    }
    // Only dims 0 and 1 may be resized; every trailing dim must be unchanged.
    for (size_t i = 2; i < output_shape.size(); ++i) {
        if (input.shape()[i] != output_shape[i]) {
            tensor_throw_invalid_argument("2D filter interpolate only supports resizing dimensions 0 and 1: input_shape=" +
                                          tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                          tensor_shape_to_string(output_shape));
        }
    }
    Tensor<T> output(output_shape);
    const int64_t input_width = input.shape()[0];
    const int64_t input_height = input.shape()[1];
    const int64_t output_width = output_shape[0];
    const int64_t output_height = output_shape[1];
    const int64_t input_plane = input_width * input_height;
    const int64_t output_plane = output_width * output_height;
    // Trailing dims are flattened into independent planes.
    const int64_t plane_count = input.numel() / input_plane;
    // Precompute per-axis taps once; the 2D filter is separable.
    auto x_contributors = make_interpolate_contributors(input_width, output_width, mode, antialias);
    auto y_contributors = make_interpolate_contributors(input_height, output_height, mode, antialias);
    for (int64_t plane = 0; plane < plane_count; ++plane) {
        const int64_t input_plane_offset = plane * input_plane;
        const int64_t output_plane_offset = plane * output_plane;
        for (int64_t y = 0; y < output_height; ++y) {
            const auto& y_axis = y_contributors[static_cast<size_t>(y)];
            for (int64_t x = 0; x < output_width; ++x) {
                const auto& x_axis = x_contributors[static_cast<size_t>(x)];
                // Weighted sum over the separable tap window, accumulated in double.
                double value = 0.0;
                for (const auto& yc : y_axis) {
                    const int64_t input_row_offset = input_plane_offset + yc.index * input_width;
                    for (const auto& xc : x_axis) {
                        value += static_cast<double>(input.data()[input_row_offset + xc.index]) *
                                 xc.weight * yc.weight;
                    }
                }
                output.data()[output_plane_offset + y * output_width + x] = static_cast<T>(value);
            }
        }
    }
    return output;
}
inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) { inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
if (index < 0) { if (index < 0) {
index += dim_size; index += dim_size;
@ -1014,17 +1205,20 @@ namespace sd {
inline Tensor<T> interpolate(const Tensor<T>& input, inline Tensor<T> interpolate(const Tensor<T>& input,
std::vector<int64_t> output_shape, std::vector<int64_t> output_shape,
InterpolateMode mode = InterpolateMode::Nearest, InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false) { bool align_corners = false,
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || bool antialias = false) {
mode == InterpolateMode::NearestMax || const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
mode == InterpolateMode::NearestMin || const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
mode == InterpolateMode::NearestAvg); if (!is_nearest_like_mode && !is_2d_filter_mode) {
if (!is_nearest_like_mode) { tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast<int>(mode)));
}
if (antialias && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
std::to_string(static_cast<int>(mode))); std::to_string(static_cast<int>(mode)));
} }
if (align_corners) { if (align_corners) {
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" + tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
tensor_shape_to_string(input.shape()) + ", output_shape=" + tensor_shape_to_string(input.shape()) + ", output_shape=" +
tensor_shape_to_string(output_shape)); tensor_shape_to_string(output_shape));
} }
@ -1051,6 +1245,10 @@ namespace sd {
} }
} }
if (is_2d_filter_mode) {
return interpolate_2d_filter(input, output_shape, mode, antialias);
}
bool has_downsampling = false; bool has_downsampling = false;
for (int64_t i = 0; i < input.dim(); ++i) { for (int64_t i = 0; i < input.dim(); ++i) {
if (input.shape()[i] > output_shape[i]) { if (input.shape()[i] > output_shape[i]) {
@ -1060,12 +1258,20 @@ namespace sd {
} }
Tensor<T> output(std::move(output_shape)); Tensor<T> output(std::move(output_shape));
if (mode == InterpolateMode::Nearest || !has_downsampling) { if (mode == InterpolateMode::Nearest ||
mode == InterpolateMode::NearestExact ||
!has_downsampling) {
for (int64_t flat = 0; flat < output.numel(); ++flat) { for (int64_t flat = 0; flat < output.numel(); ++flat) {
std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape()); std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0); std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) { for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i]; if (mode == InterpolateMode::NearestExact) {
input_coord[i] = nearest_exact_interpolate_index(output_coord[i],
input.shape()[i],
output.shape()[i]);
} else {
input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
}
} }
output[flat] = input.index(input_coord); output[flat] = input.index(input_coord);
} }
@ -1083,6 +1289,12 @@ namespace sd {
return T(0); return T(0);
case InterpolateMode::Nearest: case InterpolateMode::Nearest:
return T(0); return T(0);
case InterpolateMode::NearestExact:
return T(0);
case InterpolateMode::Bilinear:
case InterpolateMode::Bicubic:
case InterpolateMode::Lanczos:
break;
} }
tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" + tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
@ -1102,6 +1314,12 @@ namespace sd {
break; break;
case InterpolateMode::Nearest: case InterpolateMode::Nearest:
break; break;
case InterpolateMode::NearestExact:
break;
case InterpolateMode::Bilinear:
case InterpolateMode::Bicubic:
case InterpolateMode::Lanczos:
break;
} }
}; };
@ -1157,17 +1375,20 @@ namespace sd {
const std::optional<std::vector<int64_t>>& size, const std::optional<std::vector<int64_t>>& size,
const std::optional<std::vector<double>>& scale_factor, const std::optional<std::vector<double>>& scale_factor,
InterpolateMode mode = InterpolateMode::Nearest, InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false) { bool align_corners = false,
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || bool antialias = false) {
mode == InterpolateMode::NearestMax || const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
mode == InterpolateMode::NearestMin || const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
mode == InterpolateMode::NearestAvg); if (!is_nearest_like_mode && !is_2d_filter_mode) {
if (!is_nearest_like_mode) { tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast<int>(mode)));
}
if (antialias && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
std::to_string(static_cast<int>(mode))); std::to_string(static_cast<int>(mode)));
} }
if (align_corners) { if (align_corners) {
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" + tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
tensor_shape_to_string(input.shape())); tensor_shape_to_string(input.shape()));
} }
if (size.has_value() == scale_factor.has_value()) { if (size.has_value() == scale_factor.has_value()) {
@ -1211,7 +1432,7 @@ namespace sd {
} }
} }
return interpolate(input, std::move(output_shape), mode, align_corners); return interpolate(input, std::move(output_shape), mode, align_corners, antialias);
} }
template <typename T> template <typename T>
@ -1219,12 +1440,14 @@ namespace sd {
const std::optional<std::vector<int64_t>>& size, const std::optional<std::vector<int64_t>>& size,
double scale_factor, double scale_factor,
InterpolateMode mode = InterpolateMode::Nearest, InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false) { bool align_corners = false,
bool antialias = false) {
return interpolate(input, return interpolate(input,
size, size,
std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor), std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor),
mode, mode,
align_corners); align_corners,
antialias);
} }
template <typename T> template <typename T>