mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-05-08 16:28:53 +00:00
feat: add more built-in highres upscalers (#1456)
This commit is contained in:
parent
c97702e105
commit
b8bdffc199
@ -4,29 +4,29 @@
|
||||
usage: ./bin/sd-cli [options]
|
||||
|
||||
CLI Options:
|
||||
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
|
||||
./output.png) (eg. output_%03d.png). For video generation, single-file outputs support .avi, .webm, and animated .webp
|
||||
--preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp
|
||||
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
|
||||
every step)
|
||||
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
|
||||
--image <string> path to the image to inspect (for metadata mode)
|
||||
--metadata-format <string> metadata output format, one of [text, json] (default: text)
|
||||
--canny apply canny preprocessor (edge detection)
|
||||
--convert-name convert tensor name (for convert mode)
|
||||
convert mode writes `.gguf` or `.safetensors` based on the output extension.
|
||||
`.safetensors` export currently supports f16, bf16, f32, and i32 tensor types only.
|
||||
i32 is passthrough only; no f32 <-> i32 conversion is performed
|
||||
-v, --verbose print extra info
|
||||
--color colors the logging tags according to level
|
||||
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
|
||||
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
|
||||
--metadata-raw include raw hex previews for unparsed metadata payloads
|
||||
--metadata-brief truncate long metadata text values in text output
|
||||
--metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments
|
||||
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen
|
||||
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
|
||||
-h, --help show this help message and exit
|
||||
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image
|
||||
sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs
|
||||
support .avi, .webm, and animated .webp
|
||||
--image <string> path to the image to inspect (for metadata mode)
|
||||
--metadata-format <string> metadata output format, one of [text, json] (default: text)
|
||||
--preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support
|
||||
.avi, .webm, and animated .webp
|
||||
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file
|
||||
(default is 1, meaning updating at every step)
|
||||
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified
|
||||
%d in output path, 1 otherwise)
|
||||
--canny apply canny preprocessor (edge detection)
|
||||
--convert-name convert tensor name (for convert mode)
|
||||
-v, --verbose print extra info
|
||||
--color colors the logging tags according to level
|
||||
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
|
||||
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
|
||||
--metadata-raw include raw hex previews for unparsed metadata payloads
|
||||
--metadata-brief truncate long metadata text values in text output
|
||||
--metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments
|
||||
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen
|
||||
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
|
||||
-h, --help show this help message and exit
|
||||
|
||||
Context Options:
|
||||
-m, --model <string> path to full model
|
||||
@ -34,7 +34,8 @@ Context Options:
|
||||
--clip_g <string> path to the clip-g text encoder
|
||||
--clip_vision <string> path to the clip-vision encoder
|
||||
--t5xxl <string> path to the t5xxl text encoder
|
||||
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
|
||||
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
|
||||
mistral-small3.2 for flux2, ...)
|
||||
--llm_vision <string> path to the llm vit
|
||||
--qwen2vl <string> alias of --llm. Deprecated.
|
||||
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
||||
@ -46,16 +47,16 @@ Context Options:
|
||||
--control-net <string> path to control net model
|
||||
--embd-dir <string> embeddings directory
|
||||
--lora-model-dir <string> lora model directory
|
||||
--hires-upscalers-dir <string> highres fix upscaler model directory
|
||||
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
|
||||
--photo-maker <string> path to PHOTOMAKER model
|
||||
--upscale-model <string> path to esrgan model.
|
||||
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
|
||||
CPU physical cores
|
||||
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
|
||||
then threads will be set to the number of CPU physical cores
|
||||
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
||||
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
|
||||
--vae-tiling process vae in tiles to reduce memory usage
|
||||
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
|
||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
|
||||
when needed
|
||||
--mmap whether to memory-map model
|
||||
--control-net-cpu keep controlnet in cpu (for low vram)
|
||||
--clip-on-cpu keep clip in cpu (for low vram)
|
||||
@ -70,20 +71,19 @@ Context Options:
|
||||
--chroma-disable-dit-mask disable dit mask for chroma
|
||||
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
|
||||
--chroma-enable-t5-mask enable t5 mask for chroma
|
||||
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
|
||||
type of the weight file
|
||||
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
|
||||
q4_K). If not specified, the default is the type of the weight file
|
||||
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
|
||||
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
|
||||
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
|
||||
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
|
||||
contain any quantized parameters, the at_runtime mode will be used; otherwise,
|
||||
immediately will be used.The immediately mode may have precision and
|
||||
compatibility issues with quantized parameters, but it usually offers faster inference
|
||||
speed and, in some cases, lower memory usage. The at_runtime mode, on the
|
||||
other hand, is exactly the opposite.
|
||||
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
|
||||
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
|
||||
(overrides --vae-tile-size)
|
||||
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
|
||||
flux2_flow]
|
||||
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
|
||||
auto. In auto mode, if the model weights contain any quantized parameters,
|
||||
the at_runtime mode will be used; otherwise, immediately will be used. The
|
||||
immediately mode may have precision and compatibility issues with quantized
|
||||
parameters, but it usually offers faster inference speed and, in some cases,
|
||||
lower memory usage. The at_runtime mode, on the other hand, is exactly the
|
||||
opposite.
|
||||
|
||||
Generation Options:
|
||||
-p, --prompt <string> the prompt to render
|
||||
@ -92,69 +92,99 @@ Generation Options:
|
||||
--end-img <string> path to the end image, required by flf2v
|
||||
--mask <string> path to the mask image
|
||||
--control-image <string> path to control image, control net
|
||||
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in
|
||||
lexicographical (character) order. For example, if the control video path is
|
||||
`frames`, the directory contain images such as 00.png, 01.png, ... etc.
|
||||
--control-video <string> path to control video frames. It must be a directory path. The video frames
|
||||
inside should be stored as images in lexicographical (character) order. For
|
||||
example, if the control video path is `frames`, the directory contains images
|
||||
such as 00.png, 01.png, ... etc.
|
||||
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
|
||||
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
|
||||
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
|
||||
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
|
||||
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
|
||||
-H, --height <int> image height, in pixel space (default: 512)
|
||||
-W, --width <int> image width, in pixel space (default: 512)
|
||||
--steps <int> number of sample steps (default: 20)
|
||||
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
|
||||
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
|
||||
will be 1 for SD1.x, 2 for SD2.x
|
||||
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
|
||||
(default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
|
||||
-b, --batch-count <int> batch count
|
||||
--video-frames <int> video frames (default: 1)
|
||||
--fps <int> fps (default: 24)
|
||||
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
|
||||
NitroSD-Vibrant
|
||||
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
|
||||
NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
|
||||
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
|
||||
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
|
||||
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
|
||||
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
|
||||
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
|
||||
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
|
||||
128)
|
||||
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
|
||||
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
|
||||
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same
|
||||
as --cfg-scale)
|
||||
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
|
||||
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
|
||||
medium
|
||||
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
|
||||
disabled, a value of 2.5 is nice for sd3.5 medium
|
||||
--skip-layer-start <float> SLG enabling point (default: 0.01)
|
||||
--skip-layer-end <float> SLG disabling point (default: 0.2)
|
||||
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
||||
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
|
||||
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
||||
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
|
||||
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
|
||||
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
|
||||
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
|
||||
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
|
||||
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models
|
||||
(default: same as --cfg-scale)
|
||||
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
|
||||
(default: 3.5)
|
||||
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
|
||||
0)
|
||||
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
|
||||
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
|
||||
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
||||
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
|
||||
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
||||
--strength <float> strength for noising/unnoising (default: 0.75)
|
||||
--pm-style-strength <float>
|
||||
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
|
||||
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
|
||||
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
|
||||
destruction of information in init image
|
||||
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
|
||||
`--high-noise-steps` is set to -1
|
||||
--vace-strength <float> wan vace strength
|
||||
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
|
||||
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
|
||||
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
|
||||
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
|
||||
--increase-ref-index automatically increase the indices of reference images based on the order
|
||||
they are listed (starting with 1).
|
||||
--disable-auto-resize-ref-image disable auto resize of ref images
|
||||
--disable-image-metadata do not embed generation metadata on image files
|
||||
--vae-tiling process vae in tiles to reduce memory usage
|
||||
--hires enable highres fix
|
||||
-s, --seed RNG seed (default: 42, use random seed for < 0)
|
||||
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
|
||||
tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a
|
||||
otherwise)
|
||||
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
|
||||
ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan,
|
||||
euler_a otherwise
|
||||
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
|
||||
kl_optimal, lcm, bong_tangent], default: discrete
|
||||
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
|
||||
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
|
||||
dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
|
||||
er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
|
||||
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
|
||||
dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
|
||||
res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
|
||||
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
|
||||
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
|
||||
discrete
|
||||
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
|
||||
"14.61,7.8,3.5,0.0").
|
||||
--skip-layers layers to skip for SLG steps (default: [7,8,9])
|
||||
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
|
||||
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
|
||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
|
||||
'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
|
||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
|
||||
'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
|
||||
Chebyshev+Taylor forecasting)
|
||||
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
|
||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
|
||||
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
|
||||
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
|
||||
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
|
||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
|
||||
Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
|
||||
Examples: "threshold=0.25" or "threshold=1.5,reset=0"
|
||||
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
|
||||
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
|
||||
--scm-policy SCM policy: 'dynamic' (default) or 'static'
|
||||
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
|
||||
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
|
||||
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
|
||||
```
|
||||
|
||||
Metadata mode inspects PNG/JPEG container metadata without loading any model:
|
||||
|
||||
@ -690,7 +690,10 @@ int main(int argc, const char* argv[]) {
|
||||
vae_decode_only = false;
|
||||
}
|
||||
|
||||
if (gen_params.hires_enabled && !gen_params.hires_upscaler_model_path.empty()) {
|
||||
if (gen_params.hires_enabled &&
|
||||
(gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL ||
|
||||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS ||
|
||||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) {
|
||||
vae_decode_only = false;
|
||||
}
|
||||
|
||||
|
||||
@ -107,47 +107,60 @@ static bool is_absolute_path(const std::string& p) {
|
||||
|
||||
std::string ArgOptions::wrap_text(const std::string& text, size_t width, size_t indent) {
|
||||
std::ostringstream oss;
|
||||
size_t line_len = 0;
|
||||
size_t pos = 0;
|
||||
size_t line_len = 0;
|
||||
|
||||
while (pos < text.size()) {
|
||||
// Preserve manual newlines
|
||||
if (text[pos] == '\n') {
|
||||
oss << '\n'
|
||||
<< std::string(indent, ' ');
|
||||
line_len = indent;
|
||||
line_len = 0;
|
||||
++pos;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add the character
|
||||
oss << text[pos];
|
||||
++line_len;
|
||||
++pos;
|
||||
if (std::isspace(static_cast<unsigned char>(text[pos]))) {
|
||||
++pos;
|
||||
continue;
|
||||
}
|
||||
|
||||
// If the current line exceeds width, try to break at the last space
|
||||
if (line_len >= width) {
|
||||
std::string current = oss.str();
|
||||
size_t back = current.size();
|
||||
size_t word_start = pos;
|
||||
while (pos < text.size() &&
|
||||
text[pos] != '\n' &&
|
||||
!std::isspace(static_cast<unsigned char>(text[pos]))) {
|
||||
++pos;
|
||||
}
|
||||
|
||||
// Find the last space (for a clean break)
|
||||
while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n')
|
||||
--back;
|
||||
|
||||
// If found a space to break on
|
||||
if (back > 0 && current[back - 1] != '\n') {
|
||||
std::string before = current.substr(0, back - 1);
|
||||
std::string after = current.substr(back);
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
oss << before << "\n"
|
||||
<< std::string(indent, ' ') << after;
|
||||
} else {
|
||||
// If no space found, just break at width
|
||||
oss << "\n"
|
||||
<< std::string(indent, ' ');
|
||||
std::string word = text.substr(word_start, pos - word_start);
|
||||
while (!word.empty()) {
|
||||
size_t separator_len = line_len == 0 ? 0 : 1;
|
||||
if (line_len + separator_len + word.size() <= width) {
|
||||
if (separator_len > 0) {
|
||||
oss << ' ';
|
||||
++line_len;
|
||||
}
|
||||
oss << word;
|
||||
line_len += word.size();
|
||||
word.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (line_len > 0) {
|
||||
oss << '\n'
|
||||
<< std::string(indent, ' ');
|
||||
line_len = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t chunk_len = std::min(width, word.size());
|
||||
oss << word.substr(0, chunk_len);
|
||||
line_len = chunk_len;
|
||||
word.erase(0, chunk_len);
|
||||
if (!word.empty()) {
|
||||
oss << '\n'
|
||||
<< std::string(indent, ' ');
|
||||
line_len = 0;
|
||||
}
|
||||
line_len = indent;
|
||||
}
|
||||
}
|
||||
|
||||
@ -783,7 +796,9 @@ ArgOptions SDGenerationParams::get_options() {
|
||||
&pm_id_embed_path},
|
||||
{"",
|
||||
"--hires-upscaler",
|
||||
"highres fix upscaler, Latent (nearest) or a model name/path under --hires-upscalers-dir (default: Latent (nearest))",
|
||||
"highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
|
||||
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
|
||||
"under --hires-upscalers-dir (default: Latent)",
|
||||
&hires_upscaler},
|
||||
};
|
||||
|
||||
@ -1918,7 +1933,7 @@ bool SDGenerationParams::resolve(const std::string& lora_model_dir, const std::s
|
||||
hires_upscaler_model_path.clear();
|
||||
if (hires_enabled) {
|
||||
if (hires_upscaler.empty()) {
|
||||
hires_upscaler = "Latent (nearest)";
|
||||
hires_upscaler = "Latent";
|
||||
}
|
||||
resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str());
|
||||
if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) {
|
||||
|
||||
@ -192,7 +192,7 @@ struct SDGenerationParams {
|
||||
int upscale_tile_size = 128;
|
||||
|
||||
bool hires_enabled = false;
|
||||
std::string hires_upscaler = "Latent (nearest)";
|
||||
std::string hires_upscaler = "Latent";
|
||||
std::string hires_upscaler_model_path;
|
||||
float hires_scale = 2.f;
|
||||
int hires_width = 0;
|
||||
|
||||
@ -136,7 +136,8 @@ Context Options:
|
||||
--clip_g <string> path to the clip-g text encoder
|
||||
--clip_vision <string> path to the clip-vision encoder
|
||||
--t5xxl <string> path to the t5xxl text encoder
|
||||
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
|
||||
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
|
||||
mistral-small3.2 for flux2, ...)
|
||||
--llm_vision <string> path to the llm vit
|
||||
--qwen2vl <string> alias of --llm. Deprecated.
|
||||
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
||||
@ -148,16 +149,16 @@ Context Options:
|
||||
--control-net <string> path to control net model
|
||||
--embd-dir <string> embeddings directory
|
||||
--lora-model-dir <string> lora model directory
|
||||
--hires-upscalers-dir <string> highres fix upscaler model directory
|
||||
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
|
||||
--photo-maker <string> path to PHOTOMAKER model
|
||||
--upscale-model <string> path to esrgan model.
|
||||
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
|
||||
CPU physical cores
|
||||
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
|
||||
then threads will be set to the number of CPU physical cores
|
||||
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
||||
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
|
||||
--vae-tiling process vae in tiles to reduce memory usage
|
||||
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
|
||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
|
||||
when needed
|
||||
--mmap whether to memory-map model
|
||||
--control-net-cpu keep controlnet in cpu (for low vram)
|
||||
--clip-on-cpu keep clip in cpu (for low vram)
|
||||
@ -172,20 +173,19 @@ Context Options:
|
||||
--chroma-disable-dit-mask disable dit mask for chroma
|
||||
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
|
||||
--chroma-enable-t5-mask enable t5 mask for chroma
|
||||
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
|
||||
type of the weight file
|
||||
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
|
||||
q4_K). If not specified, the default is the type of the weight file
|
||||
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
|
||||
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
|
||||
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
|
||||
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
|
||||
contain any quantized parameters, the at_runtime mode will be used; otherwise,
|
||||
immediately will be used.The immediately mode may have precision and
|
||||
compatibility issues with quantized parameters, but it usually offers faster inference
|
||||
speed and, in some cases, lower memory usage. The at_runtime mode, on the
|
||||
other hand, is exactly the opposite.
|
||||
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
|
||||
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
|
||||
(overrides --vae-tile-size)
|
||||
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
|
||||
flux2_flow]
|
||||
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
|
||||
auto. In auto mode, if the model weights contain any quantized parameters,
|
||||
the at_runtime mode will be used; otherwise, immediately will be used. The
|
||||
immediately mode may have precision and compatibility issues with quantized
|
||||
parameters, but it usually offers faster inference speed and, in some cases,
|
||||
lower memory usage. The at_runtime mode, on the other hand, is exactly the
|
||||
opposite.
|
||||
|
||||
Default Generation Options:
|
||||
-p, --prompt <string> the prompt to render
|
||||
@ -194,65 +194,97 @@ Default Generation Options:
|
||||
--end-img <string> path to the end image, required by flf2v
|
||||
--mask <string> path to the mask image
|
||||
--control-image <string> path to control image, control net
|
||||
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in
|
||||
lexicographical (character) order. For example, if the control video path is
|
||||
`frames`, the directory contain images such as 00.png, 01.png, ... etc.
|
||||
--control-video <string> path to control video frames. It must be a directory path. The video frames
|
||||
inside should be stored as images in lexicographical (character) order. For
|
||||
example, if the control video path is `frames`, the directory contains images
|
||||
such as 00.png, 01.png, ... etc.
|
||||
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
|
||||
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
|
||||
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
|
||||
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
|
||||
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
|
||||
-H, --height <int> image height, in pixel space (default: 512)
|
||||
-W, --width <int> image width, in pixel space (default: 512)
|
||||
--steps <int> number of sample steps (default: 20)
|
||||
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
|
||||
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
|
||||
will be 1 for SD1.x, 2 for SD2.x
|
||||
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
|
||||
(default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
|
||||
-b, --batch-count <int> batch count
|
||||
--video-frames <int> video frames (default: 1)
|
||||
--fps <int> fps (default: 24)
|
||||
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
|
||||
NitroSD-Vibrant
|
||||
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
|
||||
NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
|
||||
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
|
||||
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
|
||||
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
|
||||
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
|
||||
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
|
||||
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
|
||||
128)
|
||||
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
|
||||
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
|
||||
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same
|
||||
as --cfg-scale)
|
||||
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
|
||||
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
|
||||
medium
|
||||
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
|
||||
disabled, a value of 2.5 is nice for sd3.5 medium
|
||||
--skip-layer-start <float> SLG enabling point (default: 0.01)
|
||||
--skip-layer-end <float> SLG disabling point (default: 0.2)
|
||||
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
||||
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
|
||||
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
||||
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
|
||||
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
|
||||
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
|
||||
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
|
||||
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
|
||||
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models
|
||||
(default: same as --cfg-scale)
|
||||
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
|
||||
(default: 3.5)
|
||||
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
|
||||
0)
|
||||
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
|
||||
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
|
||||
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
||||
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
|
||||
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
||||
--strength <float> strength for noising/unnoising (default: 0.75)
|
||||
--pm-style-strength <float>
|
||||
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
|
||||
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
|
||||
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
|
||||
destruction of information in init image
|
||||
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
|
||||
`--high-noise-steps` is set to -1
|
||||
--vace-strength <float> wan vace strength
|
||||
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
|
||||
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
|
||||
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
|
||||
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
|
||||
--increase-ref-index automatically increase the indices of reference images based on the order
|
||||
they are listed (starting with 1).
|
||||
--disable-auto-resize-ref-image disable auto resize of ref images
|
||||
--disable-image-metadata do not embed generation metadata on image files
|
||||
--vae-tiling process vae in tiles to reduce memory usage
|
||||
--hires enable highres fix
|
||||
-s, --seed RNG seed (default: 42, use random seed for < 0)
|
||||
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
|
||||
tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a
|
||||
otherwise)
|
||||
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
|
||||
ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan,
|
||||
euler_a otherwise
|
||||
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
|
||||
kl_optimal, lcm, bong_tangent], default: discrete
|
||||
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
|
||||
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
|
||||
dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
|
||||
er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
|
||||
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
|
||||
dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
|
||||
res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
|
||||
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
|
||||
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
|
||||
discrete
|
||||
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
|
||||
"14.61,7.8,3.5,0.0").
|
||||
--skip-layers layers to skip for SLG steps (default: [7,8,9])
|
||||
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
|
||||
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
|
||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
|
||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
|
||||
'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
|
||||
Chebyshev+Taylor forecasting)
|
||||
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
|
||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
|
||||
"threshold=0.25" or "threshold=1.5,reset=0"
|
||||
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
|
||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
|
||||
Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
|
||||
Examples: "threshold=0.25" or "threshold=1.5,reset=0"
|
||||
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
|
||||
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
|
||||
--scm-policy SCM policy: 'dynamic' (default) or 'static'
|
||||
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
|
||||
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
|
||||
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
|
||||
```
|
||||
|
||||
@ -219,7 +219,7 @@ Currently supported request fields:
|
||||
| `lora` | `array<object>` | Structured LoRA list |
|
||||
| `extra_images` | `array<string>` | Base64 or data URL images |
|
||||
| `enable_hr` | `boolean` | Enable highres fix for `txt2img` |
|
||||
| `hr_upscaler` | `string` | `Latent (nearest)` or an upscaler model name from `/sdapi/v1/upscalers` |
|
||||
| `hr_upscaler` | `string` | `Lanczos`, `Nearest`, a latent mode such as `Latent (nearest-exact)`, or an upscaler model name from `/sdapi/v1/upscalers` |
|
||||
| `hr_scale` | `number` | Highres scale when resize target is not set |
|
||||
| `hr_resize_x` | `integer` | Highres target width, `0` to use scale |
|
||||
| `hr_resize_y` | `integer` | Highres target height, `0` to use scale |
|
||||
@ -303,6 +303,8 @@ Built-in entries include `None`, `Lanczos`, and `Nearest`. Model-backed entries
|
||||
| --- | --- | --- |
|
||||
| `[].name` | `string` | WebUI-compatible latent upscale mode name |
|
||||
|
||||
Built-in latent modes include `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`.
|
||||
|
||||
`GET /sdapi/v1/samplers`
|
||||
|
||||
| Field | Type | Notes |
|
||||
@ -462,7 +464,7 @@ Shared nested fields:
|
||||
| --- | --- | --- |
|
||||
| `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` |
|
||||
|
||||
Built-in entries include `None` and `Latent (nearest)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.
|
||||
Built-in entries include `None`, `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.
|
||||
|
||||
`limits`
|
||||
|
||||
@ -677,7 +679,7 @@ Example:
|
||||
"lora": [],
|
||||
"hires": {
|
||||
"enabled": false,
|
||||
"upscaler": "Latent (nearest)",
|
||||
"upscaler": "Latent",
|
||||
"scale": 2.0,
|
||||
"target_width": 0,
|
||||
"target_height": 0,
|
||||
@ -804,7 +806,7 @@ Other native fields:
|
||||
| `scm_mask` | `string` |
|
||||
| `scm_policy_dynamic` | `boolean` |
|
||||
|
||||
For `hires.upscaler`, use `Latent (nearest)` for latent upscale or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory.
|
||||
For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory.
|
||||
|
||||
HTTP-only output fields:
|
||||
|
||||
|
||||
@ -381,6 +381,8 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
|
||||
|
||||
json result = json::array();
|
||||
result.push_back(make_builtin("None"));
|
||||
result.push_back(make_builtin("Lanczos"));
|
||||
result.push_back(make_builtin("Nearest"));
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(*runtime->upscaler_mutex);
|
||||
@ -400,7 +402,12 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
|
||||
|
||||
svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) {
|
||||
json result = json::array({
|
||||
{{"name", "Latent"}},
|
||||
{{"name", "Latent (nearest)"}},
|
||||
{{"name", "Latent (nearest-exact)"}},
|
||||
{{"name", "Latent (antialiased)"}},
|
||||
{{"name", "Latent (bicubic)"}},
|
||||
{{"name", "Latent (bicubic antialiased)"}},
|
||||
});
|
||||
res.set_content(result.dump(), "application/json");
|
||||
});
|
||||
|
||||
@ -227,9 +227,30 @@ static json make_capabilities_json(ServerRuntime& runtime) {
|
||||
available_upscalers.push_back({
|
||||
{"name", "None"},
|
||||
});
|
||||
available_upscalers.push_back({
|
||||
{"name", "Lanczos"},
|
||||
});
|
||||
available_upscalers.push_back({
|
||||
{"name", "Nearest"},
|
||||
});
|
||||
available_upscalers.push_back({
|
||||
{"name", "Latent"},
|
||||
});
|
||||
available_upscalers.push_back({
|
||||
{"name", "Latent (nearest)"},
|
||||
});
|
||||
available_upscalers.push_back({
|
||||
{"name", "Latent (nearest-exact)"},
|
||||
});
|
||||
available_upscalers.push_back({
|
||||
{"name", "Latent (antialiased)"},
|
||||
});
|
||||
available_upscalers.push_back({
|
||||
{"name", "Latent (bicubic)"},
|
||||
});
|
||||
available_upscalers.push_back({
|
||||
{"name", "Latent (bicubic antialiased)"},
|
||||
});
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(*runtime.upscaler_mutex);
|
||||
for (const auto& entry : *runtime.upscaler_cache) {
|
||||
|
||||
@ -291,7 +291,14 @@ typedef struct {
|
||||
|
||||
// Upscaling strategy for the highres-fix pass.
// Enumerator values are used as indices into the hires_upscaler_to_str name table,
// so the order here and the order of that table must be kept identical.
enum sd_hires_upscaler_t {
    SD_HIRES_UPSCALER_NONE,                        // "None"
    SD_HIRES_UPSCALER_LATENT,                      // latent-space resize, bilinear
    SD_HIRES_UPSCALER_LATENT_NEAREST,              // latent-space resize, nearest
    SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT,        // latent-space resize, nearest-exact
    SD_HIRES_UPSCALER_LATENT_ANTIALIASED,          // latent-space resize, bilinear + antialias
    SD_HIRES_UPSCALER_LATENT_BICUBIC,              // latent-space resize, bicubic
    SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED,  // latent-space resize, bicubic + antialias
    SD_HIRES_UPSCALER_LANCZOS,                     // decode, Lanczos image resize, re-encode
    SD_HIRES_UPSCALER_NEAREST,                     // decode, nearest image resize, re-encode
    SD_HIRES_UPSCALER_MODEL,                       // decode, model-backed upscaler, re-encode
    SD_HIRES_UPSCALER_COUNT,                       // number of modes above; not a mode itself
};
|
||||
|
||||
@ -2116,12 +2116,19 @@ enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) {
|
||||
|
||||
const char* hires_upscaler_to_str[] = {
|
||||
"None",
|
||||
"Latent",
|
||||
"Latent (nearest)",
|
||||
"Latent (nearest-exact)",
|
||||
"Latent (antialiased)",
|
||||
"Latent (bicubic)",
|
||||
"Latent (bicubic antialiased)",
|
||||
"Lanczos",
|
||||
"Nearest",
|
||||
"Model",
|
||||
};
|
||||
|
||||
const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler) {
|
||||
if (upscaler < SD_HIRES_UPSCALER_COUNT) {
|
||||
if (upscaler >= SD_HIRES_UPSCALER_NONE && upscaler < SD_HIRES_UPSCALER_COUNT) {
|
||||
return hires_upscaler_to_str[upscaler];
|
||||
}
|
||||
return NONE_STR;
|
||||
@ -2167,7 +2174,7 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
|
||||
void sd_hires_params_init(sd_hires_params_t* hires_params) {
|
||||
*hires_params = {};
|
||||
hires_params->enabled = false;
|
||||
hires_params->upscaler = SD_HIRES_UPSCALER_LATENT_NEAREST;
|
||||
hires_params->upscaler = SD_HIRES_UPSCALER_LATENT;
|
||||
hires_params->model_path = nullptr;
|
||||
hires_params->scale = 2.0f;
|
||||
hires_params->target_width = 0;
|
||||
@ -2658,7 +2665,7 @@ struct GenerationRequest {
|
||||
hires.enabled = false;
|
||||
return;
|
||||
}
|
||||
if (hires.upscaler < SD_HIRES_UPSCALER_NONE && hires.upscaler >= SD_HIRES_UPSCALER_COUNT) {
|
||||
if (hires.upscaler < SD_HIRES_UPSCALER_NONE || hires.upscaler >= SD_HIRES_UPSCALER_COUNT) {
|
||||
LOG_WARN("hires upscaler '%d' is invalid, disabling hires", hires.upscaler);
|
||||
hires.enabled = false;
|
||||
return;
|
||||
@ -3252,55 +3259,123 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
|
||||
const sd::Tensor<float>& latent,
|
||||
const GenerationRequest& request,
|
||||
UpscalerGGML* upscaler) {
|
||||
if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST) {
|
||||
auto get_hires_latent_target_shape = [&]() {
|
||||
std::vector<int64_t> target_shape = latent.shape();
|
||||
if (target_shape.size() < 2) {
|
||||
LOG_ERROR("latent has invalid shape for hires upscale");
|
||||
return {};
|
||||
target_shape.clear();
|
||||
return target_shape;
|
||||
}
|
||||
target_shape[0] = request.hires.target_width / request.vae_scale_factor;
|
||||
target_shape[1] = request.hires.target_height / request.vae_scale_factor;
|
||||
return target_shape;
|
||||
};
|
||||
|
||||
LOG_INFO("hires latent upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64,
|
||||
if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT ||
|
||||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST ||
|
||||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT ||
|
||||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_ANTIALIASED ||
|
||||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC ||
|
||||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED) {
|
||||
std::vector<int64_t> target_shape = get_hires_latent_target_shape();
|
||||
if (target_shape.empty()) {
|
||||
LOG_ERROR("latent has invalid shape for hires upscale");
|
||||
return {};
|
||||
}
|
||||
|
||||
sd::ops::InterpolateMode mode = sd::ops::InterpolateMode::Nearest;
|
||||
bool antialias = false;
|
||||
switch (request.hires.upscaler) {
|
||||
case SD_HIRES_UPSCALER_LATENT:
|
||||
mode = sd::ops::InterpolateMode::Bilinear;
|
||||
break;
|
||||
case SD_HIRES_UPSCALER_LATENT_NEAREST:
|
||||
mode = sd::ops::InterpolateMode::Nearest;
|
||||
break;
|
||||
case SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT:
|
||||
mode = sd::ops::InterpolateMode::NearestExact;
|
||||
break;
|
||||
case SD_HIRES_UPSCALER_LATENT_ANTIALIASED:
|
||||
mode = sd::ops::InterpolateMode::Bilinear;
|
||||
antialias = true;
|
||||
break;
|
||||
case SD_HIRES_UPSCALER_LATENT_BICUBIC:
|
||||
mode = sd::ops::InterpolateMode::Bicubic;
|
||||
break;
|
||||
case SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED:
|
||||
mode = sd::ops::InterpolateMode::Bicubic;
|
||||
antialias = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
LOG_INFO("hires %s upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64,
|
||||
sd_hires_upscaler_name(request.hires.upscaler),
|
||||
latent.shape()[0],
|
||||
latent.shape()[1],
|
||||
target_shape[0],
|
||||
target_shape[1]);
|
||||
return sd::ops::interpolate(latent, target_shape, sd::ops::InterpolateMode::Nearest);
|
||||
} else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
|
||||
if (upscaler == nullptr) {
|
||||
LOG_ERROR("hires model upscaler context is null");
|
||||
|
||||
return sd::ops::interpolate(latent, target_shape, mode, false, antialias);
|
||||
} else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL ||
|
||||
request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS ||
|
||||
request.hires.upscaler == SD_HIRES_UPSCALER_NEAREST) {
|
||||
if (sd_ctx->sd->vae_decode_only) {
|
||||
LOG_ERROR("hires %s upscaler requires VAE encoder weights; create the context with vae_decode_only=false",
|
||||
sd_hires_upscaler_name(request.hires.upscaler));
|
||||
return {};
|
||||
}
|
||||
if (sd_ctx->sd->vae_decode_only) {
|
||||
LOG_ERROR("hires model upscaler requires VAE encoder weights; create the context with vae_decode_only=false");
|
||||
if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL && upscaler == nullptr) {
|
||||
LOG_ERROR("hires model upscaler context is null");
|
||||
return {};
|
||||
}
|
||||
|
||||
sd::Tensor<float> decoded = sd_ctx->sd->decode_first_stage(latent);
|
||||
if (decoded.empty()) {
|
||||
LOG_ERROR("decode_first_stage failed before hires model upscale");
|
||||
LOG_ERROR("decode_first_stage failed before hires %s upscale",
|
||||
sd_hires_upscaler_name(request.hires.upscaler));
|
||||
return {};
|
||||
}
|
||||
|
||||
sd::Tensor<float> upscaled_tensor = upscaler->upscale_tensor(decoded);
|
||||
if (upscaled_tensor.empty()) {
|
||||
LOG_ERROR("hires model upscale failed");
|
||||
return {};
|
||||
}
|
||||
sd::Tensor<float> upscaled_tensor;
|
||||
if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
|
||||
upscaled_tensor = upscaler->upscale_tensor(decoded);
|
||||
if (upscaled_tensor.empty()) {
|
||||
LOG_ERROR("hires model upscale failed");
|
||||
return {};
|
||||
}
|
||||
|
||||
if (upscaled_tensor.shape()[0] != request.hires.target_width ||
|
||||
upscaled_tensor.shape()[1] != request.hires.target_height) {
|
||||
upscaled_tensor = sd::ops::interpolate(upscaled_tensor,
|
||||
if (upscaled_tensor.shape()[0] != request.hires.target_width ||
|
||||
upscaled_tensor.shape()[1] != request.hires.target_height) {
|
||||
upscaled_tensor = sd::ops::interpolate(upscaled_tensor,
|
||||
{request.hires.target_width,
|
||||
request.hires.target_height,
|
||||
upscaled_tensor.shape()[2],
|
||||
upscaled_tensor.shape()[3]});
|
||||
}
|
||||
} else {
|
||||
sd::ops::InterpolateMode mode = request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS
|
||||
? sd::ops::InterpolateMode::Lanczos
|
||||
: sd::ops::InterpolateMode::Nearest;
|
||||
LOG_INFO("hires %s image upscale %" PRId64 "x%" PRId64 " -> %dx%d",
|
||||
sd_hires_upscaler_name(request.hires.upscaler),
|
||||
decoded.shape()[0],
|
||||
decoded.shape()[1],
|
||||
request.hires.target_width,
|
||||
request.hires.target_height);
|
||||
upscaled_tensor = sd::ops::interpolate(decoded,
|
||||
{request.hires.target_width,
|
||||
request.hires.target_height,
|
||||
upscaled_tensor.shape()[2],
|
||||
upscaled_tensor.shape()[3]});
|
||||
decoded.shape()[2],
|
||||
decoded.shape()[3]},
|
||||
mode);
|
||||
upscaled_tensor = sd::ops::clamp(upscaled_tensor, 0.0f, 1.0f);
|
||||
}
|
||||
|
||||
sd::Tensor<float> upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor);
|
||||
if (upscaled_latent.empty()) {
|
||||
LOG_ERROR("encode_first_stage failed after hires model upscale");
|
||||
LOG_ERROR("encode_first_stage failed after hires %s upscale",
|
||||
sd_hires_upscaler_name(request.hires.upscaler));
|
||||
}
|
||||
return upscaled_latent;
|
||||
}
|
||||
|
||||
265
src/tensor.hpp
265
src/tensor.hpp
@ -815,11 +815,202 @@ namespace sd {
|
||||
namespace ops {
|
||||
// Resampling kernels understood by the tensor interpolate helpers.
enum class InterpolateMode {
    // Nearest-like family (see is_nearest_like_interpolate_mode).
    Nearest,
    NearestExact,
    NearestMax,
    NearestMin,
    NearestAvg,
    // 2D filter family (see is_2d_filter_interpolate_mode); resizes dimensions 0 and 1 only.
    Bilinear,
    Bicubic,
    Lanczos,
};
|
||||
|
||||
// Returns true when `mode` is one of the nearest-neighbour style modes
// (Nearest, NearestExact, NearestMax, NearestMin, NearestAvg).
inline bool is_nearest_like_interpolate_mode(InterpolateMode mode) {
    switch (mode) {
        case InterpolateMode::Nearest:
        case InterpolateMode::NearestExact:
        case InterpolateMode::NearestMax:
        case InterpolateMode::NearestMin:
        case InterpolateMode::NearestAvg:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
// Returns true when `mode` is handled by the weighted 2D filter path
// (Bilinear, Bicubic or Lanczos).
inline bool is_2d_filter_interpolate_mode(InterpolateMode mode) {
    switch (mode) {
        case InterpolateMode::Bilinear:
        case InterpolateMode::Bicubic:
        case InterpolateMode::Lanczos:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
// Maps an output coordinate to the input sample whose centre is closest,
// using pixel-centre alignment, and clamps the result to [0, input_size - 1].
inline int64_t nearest_exact_interpolate_index(int64_t output_index,
                                               int64_t input_size,
                                               int64_t output_size) {
    const double in_per_out    = static_cast<double>(input_size) / static_cast<double>(output_size);
    const double source_center = (static_cast<double>(output_index) + 0.5) * in_per_out - 0.5;

    // Round the source centre to the nearest integer sample.
    int64_t picked = static_cast<int64_t>(std::floor(source_center + 0.5));
    if (picked < 0) {
        picked = 0;
    }
    const int64_t last = input_size - 1;
    if (last < picked) {
        picked = last;
    }
    return picked;
}
|
||||
|
||||
// Triangle (linear) kernel: 1 - |x| inside the unit radius, 0 elsewhere.
inline double linear_interpolate_weight(double x) {
    const double distance = std::fabs(x);
    if (distance < 1.0) {
        return 1.0 - distance;
    }
    return 0.0;
}
|
||||
|
||||
// Cubic convolution kernel with radius 2.
// Uses a piecewise polynomial: one branch for |x| <= 1, another for 1 < |x| < 2, zero beyond.
inline double cubic_interpolate_weight(double x) {
    constexpr double a = -0.75;  // Match PyTorch bicubic interpolation.
    const double t     = std::fabs(x);
    return t <= 1.0
               ? ((a + 2.0) * t - (a + 3.0)) * t * t + 1.0
               : (t < 2.0
                      ? ((a * t - 5.0 * a) * t + 8.0 * a) * t - 4.0 * a
                      : 0.0);
}
|
||||
|
||||
// Normalized sinc: sin(pi * x) / (pi * x), defined as 1 where |x| is below 1e-12
// to avoid dividing by (nearly) zero.
inline double sinc(double x) {
    constexpr double pi = 3.14159265358979323846;
    const double angle  = pi * x;
    return std::abs(x) < 1e-12 ? 1.0 : std::sin(angle) / angle;
}
|
||||
|
||||
inline double lanczos_interpolate_weight(double x) {
|
||||
constexpr double radius = 3.0;
|
||||
x = std::abs(x);
|
||||
if (x >= radius) {
|
||||
return 0.0;
|
||||
}
|
||||
return sinc(x) * sinc(x / radius);
|
||||
}
|
||||
|
||||
// One tap of a 1D resampling kernel: an input sample and the weight applied to it.
struct InterpolateContributor {
    int64_t index;  // position along the input axis (already clamped by the producer)
    double weight;  // kernel weight for that sample
};
|
||||
|
||||
inline std::vector<std::vector<InterpolateContributor>> make_interpolate_contributors(
|
||||
int64_t input_size,
|
||||
int64_t output_size,
|
||||
InterpolateMode mode,
|
||||
bool antialias) {
|
||||
std::vector<std::vector<InterpolateContributor>> contributors(static_cast<size_t>(output_size));
|
||||
const double scale = static_cast<double>(input_size) / static_cast<double>(output_size);
|
||||
const double filter_scale = antialias ? std::max(1.0, scale) : 1.0;
|
||||
|
||||
for (int64_t out = 0; out < output_size; ++out) {
|
||||
const double center = (static_cast<double>(out) + 0.5) * scale - 0.5;
|
||||
int64_t start = 0;
|
||||
int64_t end = 0;
|
||||
|
||||
if (mode == InterpolateMode::Bilinear) {
|
||||
const double support = filter_scale;
|
||||
start = static_cast<int64_t>(std::ceil(center - support));
|
||||
end = static_cast<int64_t>(std::floor(center + support));
|
||||
} else if (mode == InterpolateMode::Bicubic) {
|
||||
const double support = 2.0 * filter_scale;
|
||||
start = static_cast<int64_t>(std::ceil(center - support));
|
||||
end = static_cast<int64_t>(std::floor(center + support));
|
||||
} else if (mode == InterpolateMode::Lanczos) {
|
||||
const double support = 3.0 * filter_scale;
|
||||
start = static_cast<int64_t>(std::ceil(center - support));
|
||||
end = static_cast<int64_t>(std::floor(center + support));
|
||||
} else {
|
||||
tensor_throw_invalid_argument("Unsupported 2D filter interpolate mode: mode=" +
|
||||
std::to_string(static_cast<int>(mode)));
|
||||
}
|
||||
|
||||
double weight_sum = 0.0;
|
||||
std::vector<InterpolateContributor>& axis_contributors = contributors[static_cast<size_t>(out)];
|
||||
axis_contributors.reserve(static_cast<size_t>(end - start + 1));
|
||||
|
||||
for (int64_t in = start; in <= end; ++in) {
|
||||
double weight = 0.0;
|
||||
if (mode == InterpolateMode::Bilinear) {
|
||||
weight = linear_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
|
||||
} else if (mode == InterpolateMode::Bicubic) {
|
||||
weight = cubic_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
|
||||
} else {
|
||||
weight = lanczos_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
|
||||
}
|
||||
|
||||
if (weight == 0.0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int64_t clamped_index = std::min(std::max<int64_t>(in, 0), input_size - 1);
|
||||
axis_contributors.push_back({clamped_index, weight});
|
||||
weight_sum += weight;
|
||||
}
|
||||
|
||||
if ((antialias || mode == InterpolateMode::Lanczos) &&
|
||||
std::abs(weight_sum) > 1e-12) {
|
||||
for (auto& contributor : axis_contributors) {
|
||||
contributor.weight /= weight_sum;
|
||||
}
|
||||
}
|
||||
|
||||
if (axis_contributors.empty()) {
|
||||
const int64_t nearest = std::min(
|
||||
std::max<int64_t>(static_cast<int64_t>(std::floor(center + 0.5)), 0),
|
||||
input_size - 1);
|
||||
axis_contributors.push_back({nearest, 1.0});
|
||||
}
|
||||
}
|
||||
|
||||
return contributors;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline Tensor<T> interpolate_2d_filter(const Tensor<T>& input,
|
||||
const std::vector<int64_t>& output_shape,
|
||||
InterpolateMode mode,
|
||||
bool antialias) {
|
||||
if (input.dim() < 2) {
|
||||
tensor_throw_invalid_argument("2D filter interpolate requires rank >= 2: input_shape=" +
|
||||
tensor_shape_to_string(input.shape()) + ", output_shape=" +
|
||||
tensor_shape_to_string(output_shape));
|
||||
}
|
||||
for (size_t i = 2; i < output_shape.size(); ++i) {
|
||||
if (input.shape()[i] != output_shape[i]) {
|
||||
tensor_throw_invalid_argument("2D filter interpolate only supports resizing dimensions 0 and 1: input_shape=" +
|
||||
tensor_shape_to_string(input.shape()) + ", output_shape=" +
|
||||
tensor_shape_to_string(output_shape));
|
||||
}
|
||||
}
|
||||
|
||||
Tensor<T> output(output_shape);
|
||||
const int64_t input_width = input.shape()[0];
|
||||
const int64_t input_height = input.shape()[1];
|
||||
const int64_t output_width = output_shape[0];
|
||||
const int64_t output_height = output_shape[1];
|
||||
const int64_t input_plane = input_width * input_height;
|
||||
const int64_t output_plane = output_width * output_height;
|
||||
const int64_t plane_count = input.numel() / input_plane;
|
||||
|
||||
auto x_contributors = make_interpolate_contributors(input_width, output_width, mode, antialias);
|
||||
auto y_contributors = make_interpolate_contributors(input_height, output_height, mode, antialias);
|
||||
|
||||
for (int64_t plane = 0; plane < plane_count; ++plane) {
|
||||
const int64_t input_plane_offset = plane * input_plane;
|
||||
const int64_t output_plane_offset = plane * output_plane;
|
||||
for (int64_t y = 0; y < output_height; ++y) {
|
||||
const auto& y_axis = y_contributors[static_cast<size_t>(y)];
|
||||
for (int64_t x = 0; x < output_width; ++x) {
|
||||
const auto& x_axis = x_contributors[static_cast<size_t>(x)];
|
||||
double value = 0.0;
|
||||
for (const auto& yc : y_axis) {
|
||||
const int64_t input_row_offset = input_plane_offset + yc.index * input_width;
|
||||
for (const auto& xc : x_axis) {
|
||||
value += static_cast<double>(input.data()[input_row_offset + xc.index]) *
|
||||
xc.weight * yc.weight;
|
||||
}
|
||||
}
|
||||
output.data()[output_plane_offset + y * output_width + x] = static_cast<T>(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
|
||||
if (index < 0) {
|
||||
index += dim_size;
|
||||
@ -1014,17 +1205,20 @@ namespace sd {
|
||||
inline Tensor<T> interpolate(const Tensor<T>& input,
|
||||
std::vector<int64_t> output_shape,
|
||||
InterpolateMode mode = InterpolateMode::Nearest,
|
||||
bool align_corners = false) {
|
||||
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
|
||||
mode == InterpolateMode::NearestMax ||
|
||||
mode == InterpolateMode::NearestMin ||
|
||||
mode == InterpolateMode::NearestAvg);
|
||||
if (!is_nearest_like_mode) {
|
||||
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
|
||||
bool align_corners = false,
|
||||
bool antialias = false) {
|
||||
const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
|
||||
const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
|
||||
if (!is_nearest_like_mode && !is_2d_filter_mode) {
|
||||
tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
|
||||
std::to_string(static_cast<int>(mode)));
|
||||
}
|
||||
if (antialias && !is_2d_filter_mode) {
|
||||
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
|
||||
std::to_string(static_cast<int>(mode)));
|
||||
}
|
||||
if (align_corners) {
|
||||
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
|
||||
tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
|
||||
tensor_shape_to_string(input.shape()) + ", output_shape=" +
|
||||
tensor_shape_to_string(output_shape));
|
||||
}
|
||||
@ -1051,6 +1245,10 @@ namespace sd {
|
||||
}
|
||||
}
|
||||
|
||||
if (is_2d_filter_mode) {
|
||||
return interpolate_2d_filter(input, output_shape, mode, antialias);
|
||||
}
|
||||
|
||||
bool has_downsampling = false;
|
||||
for (int64_t i = 0; i < input.dim(); ++i) {
|
||||
if (input.shape()[i] > output_shape[i]) {
|
||||
@ -1060,12 +1258,20 @@ namespace sd {
|
||||
}
|
||||
|
||||
Tensor<T> output(std::move(output_shape));
|
||||
if (mode == InterpolateMode::Nearest || !has_downsampling) {
|
||||
if (mode == InterpolateMode::Nearest ||
|
||||
mode == InterpolateMode::NearestExact ||
|
||||
!has_downsampling) {
|
||||
for (int64_t flat = 0; flat < output.numel(); ++flat) {
|
||||
std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
|
||||
std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
|
||||
for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
|
||||
input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
|
||||
if (mode == InterpolateMode::NearestExact) {
|
||||
input_coord[i] = nearest_exact_interpolate_index(output_coord[i],
|
||||
input.shape()[i],
|
||||
output.shape()[i]);
|
||||
} else {
|
||||
input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
|
||||
}
|
||||
}
|
||||
output[flat] = input.index(input_coord);
|
||||
}
|
||||
@ -1083,6 +1289,12 @@ namespace sd {
|
||||
return T(0);
|
||||
case InterpolateMode::Nearest:
|
||||
return T(0);
|
||||
case InterpolateMode::NearestExact:
|
||||
return T(0);
|
||||
case InterpolateMode::Bilinear:
|
||||
case InterpolateMode::Bicubic:
|
||||
case InterpolateMode::Lanczos:
|
||||
break;
|
||||
}
|
||||
|
||||
tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
|
||||
@ -1102,6 +1314,12 @@ namespace sd {
|
||||
break;
|
||||
case InterpolateMode::Nearest:
|
||||
break;
|
||||
case InterpolateMode::NearestExact:
|
||||
break;
|
||||
case InterpolateMode::Bilinear:
|
||||
case InterpolateMode::Bicubic:
|
||||
case InterpolateMode::Lanczos:
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
@ -1157,17 +1375,20 @@ namespace sd {
|
||||
const std::optional<std::vector<int64_t>>& size,
|
||||
const std::optional<std::vector<double>>& scale_factor,
|
||||
InterpolateMode mode = InterpolateMode::Nearest,
|
||||
bool align_corners = false) {
|
||||
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
|
||||
mode == InterpolateMode::NearestMax ||
|
||||
mode == InterpolateMode::NearestMin ||
|
||||
mode == InterpolateMode::NearestAvg);
|
||||
if (!is_nearest_like_mode) {
|
||||
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
|
||||
bool align_corners = false,
|
||||
bool antialias = false) {
|
||||
const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
|
||||
const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
|
||||
if (!is_nearest_like_mode && !is_2d_filter_mode) {
|
||||
tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
|
||||
std::to_string(static_cast<int>(mode)));
|
||||
}
|
||||
if (antialias && !is_2d_filter_mode) {
|
||||
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
|
||||
std::to_string(static_cast<int>(mode)));
|
||||
}
|
||||
if (align_corners) {
|
||||
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
|
||||
tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
|
||||
tensor_shape_to_string(input.shape()));
|
||||
}
|
||||
if (size.has_value() == scale_factor.has_value()) {
|
||||
@ -1211,7 +1432,7 @@ namespace sd {
|
||||
}
|
||||
}
|
||||
|
||||
return interpolate(input, std::move(output_shape), mode, align_corners);
|
||||
return interpolate(input, std::move(output_shape), mode, align_corners, antialias);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -1219,12 +1440,14 @@ namespace sd {
|
||||
const std::optional<std::vector<int64_t>>& size,
|
||||
double scale_factor,
|
||||
InterpolateMode mode = InterpolateMode::Nearest,
|
||||
bool align_corners = false) {
|
||||
bool align_corners = false,
|
||||
bool antialias = false) {
|
||||
return interpolate(input,
|
||||
size,
|
||||
std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor),
|
||||
mode,
|
||||
align_corners);
|
||||
align_corners,
|
||||
antialias);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user