refactor: split SDParams to SDCliParams/SDContextParams/SDGenerationParams (#1032)

leejet 2025-12-03 22:31:46 +08:00 committed by GitHub
parent edf2cb3846
commit 5865b5e703
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 1588 additions and 1444 deletions
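
The split mirrors the three option groups in the updated help text below: process-level CLI options, options that configure the sd context, and per-generation options. A minimal sketch of what the new structs might look like; the field names here are inferred from those option groups and are not copied from the CLI code:

```cpp
// Illustrative sketch only: the real structs carry many more fields, and the
// member names/types below are assumptions based on the help text groups.
#include <cstdint>
#include <string>

struct SDCliParams {        // "CLI Options": process-level behaviour
    std::string mode        = "img_gen";
    std::string output_path = "./output.png";
    bool verbose            = false;
    bool canny_preprocess   = false;
};

struct SDContextParams {    // "Context Options": everything needed to build the context
    std::string model_path;
    std::string vae_path;
    int n_threads       = -1;
    bool vae_tiling     = false;
    bool offload_to_cpu = false;
};

struct SDGenerationParams { // "Generation Options": per-image / per-video settings
    std::string prompt;
    std::string negative_prompt;
    int width        = 512;
    int height       = 512;
    int sample_steps = 20;
    float cfg_scale  = 7.0f;
    int64_t seed     = 42;
};

// The CLI can then keep one object per concern instead of one flat SDParams.
struct SDParams {
    SDCliParams cli;
    SDContextParams context;
    SDGenerationParams generation;
};
```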


@@ -3,7 +3,21 @@
 ```
 usage: ./bin/sd [options]
-Options:
+CLI Options:
+  -o, --output <string>                path to write result image to (default: ./output.png)
+  --preview-path <string>              path to write preview image to (default: ./preview.png)
+  --preview-interval <int>             interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
+                                       every step)
+  --canny                              apply canny preprocessor (edge detection)
+  -v, --verbose                        print extra info
+  --color                              colors the logging tags according to level
+  --taesd-preview-only                 prevents usage of taesd for decoding the final image. (for use with --preview tae)
+  --preview-noisy                      enables previewing noisy inputs of the models rather than the denoised outputs
+  -M, --mode                           run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
+  --preview                            preview method. must be one of the following [none, proj, tae, vae] (default is none)
+  -h, --help                           show this help message and exit
+Context Options:
   -m, --model <string>                 path to full model
   --clip_l <string>                    path to the clip-l text encoder
   --clip_g <string>                    path to the clip-g text encoder
@@ -20,25 +34,52 @@ Options:
   --control-net <string>               path to control net model
   --embd-dir <string>                  embeddings directory
   --lora-model-dir <string>            lora model directory
-  -i, --init-img <string>              path to the init image
-  --end-img <string>                   path to the end image, required by flf2v
   --tensor-type-rules <string>         weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
   --photo-maker <string>               path to PHOTOMAKER model
-  --pm-id-images-dir <string>          path to PHOTOMAKER input id images dir
-  --pm-id-embed-path <string>          path to PHOTOMAKER v2 id embed
+  --upscale-model <string>             path to esrgan model.
+  -t, --threads <int>                  number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
+                                       CPU physical cores
+  --chroma-t5-mask-pad <int>           t5 mask pad size of chroma
+  --vae-tile-overlap <float>           tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --flow-shift <float>                 shift value for Flow models like SD3.x or WAN (default: auto)
+  --vae-tiling                         process vae in tiles to reduce memory usage
+  --force-sdxl-vae-conv-scale          force use of conv scale on sdxl vae
+  --offload-to-cpu                     place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
+  --control-net-cpu                    keep controlnet in cpu (for low vram)
+  --clip-on-cpu                        keep clip in cpu (for low vram)
+  --vae-on-cpu                         keep vae in cpu (for low vram)
+  --diffusion-fa                       use flash attention in the diffusion model
+  --diffusion-conv-direct              use ggml_conv2d_direct in the diffusion model
+  --vae-conv-direct                    use ggml_conv2d_direct in the vae model
+  --chroma-disable-dit-mask            disable dit mask for chroma
+  --chroma-enable-t5-mask              enable t5 mask for chroma
+  --type                               weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
+                                       type of the weight file
+  --rng                                RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
+  --sampler-rng                        sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
+  --prediction                         prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
+  --lora-apply-mode                    the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
+                                       contain any quantized parameters, the at_runtime mode will be used; otherwise,
+                                       immediately will be used.The immediately mode may have precision and
+                                       compatibility issues with quantized parameters, but it usually offers faster inference
+                                       speed and, in some cases, lower memory usage. The at_runtime mode, on the
+                                       other hand, is exactly the opposite.
+  --vae-tile-size                      tile size for vae tiling, format [X]x[Y] (default: 32x32)
+  --vae-relative-tile-size             relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
+                                       (overrides --vae-tile-size)
+Generation Options:
+  -p, --prompt <string>                the prompt to render
+  -n, --negative-prompt <string>       the negative prompt (default: "")
+  -i, --init-img <string>              path to the init image
+  --end-img <string>                   path to the end image, required by flf2v
   --mask <string>                      path to the mask image
   --control-image <string>             path to control image, control net
   --control-video <string>             path to control video frames, It must be a directory path. The video frames inside should be stored as images in
                                        lexicographical (character) order. For example, if the control video path is
                                        `frames`, the directory contain images such as 00.png, 01.png, ... etc.
-  -o, --output <string>                path to write result image to (default: ./output.png)
-  -p, --prompt <string>                the prompt to render
-  -n, --negative-prompt <string>       the negative prompt (default: "")
-  --preview-path <string>              path to write preview image to (default: ./preview.png)
-  --upscale-model <string>             path to esrgan model.
-  -t, --threads <int>                  number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
-                                       CPU physical cores
-  --upscale-repeats <int>              Run the ESRGAN upscaler this many times (default: 1)
+  --pm-id-images-dir <string>          path to PHOTOMAKER input id images dir
+  --pm-id-embed-path <string>          path to PHOTOMAKER v2 id embed
   -H, --height <int>                   image height, in pixel space (default: 512)
   -W, --width <int>                    image width, in pixel space (default: 512)
   --steps <int>                        number of sample steps (default: 20)
@@ -46,13 +87,11 @@ Options:
   --clip-skip <int>                    ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
                                        will be 1 for SD1.x, 2 for SD2.x
   -b, --batch-count <int>              batch count
-  --chroma-t5-mask-pad <int>           t5 mask pad size of chroma
   --video-frames <int>                 video frames (default: 1)
   --fps <int>                          fps (default: 24)
   --timestep-shift <int>               shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
                                        NitroSD-Vibrant
-  --preview-interval <int>             interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
-                                       every step)
+  --upscale-repeats <int>              Run the ESRGAN upscaler this many times (default: 1)
   --cfg-scale <float>                  unconditional guidance scale: (default: 7.0)
   --img-cfg-scale <float>              image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
   --guidance <float>                   distilled guidance scale for models with guidance input (default: 3.5)
@@ -72,53 +111,18 @@ Options:
   --pm-style-strength <float>
   --control-strength <float>           strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
   --moe-boundary <float>               timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
-  --flow-shift <float>                 shift value for Flow models like SD3.x or WAN (default: auto)
   --vace-strength <float>              wan vace strength
-  --vae-tile-overlap <float>           tile overlap for vae tiling, in fraction of tile size (default: 0.5)
-  --vae-tiling                         process vae in tiles to reduce memory usage
-  --force-sdxl-vae-conv-scale          force use of conv scale on sdxl vae
-  --offload-to-cpu                     place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
-  --control-net-cpu                    keep controlnet in cpu (for low vram)
-  --clip-on-cpu                        keep clip in cpu (for low vram)
-  --vae-on-cpu                         keep vae in cpu (for low vram)
-  --diffusion-fa                       use flash attention in the diffusion model
-  --diffusion-conv-direct              use ggml_conv2d_direct in the diffusion model
-  --vae-conv-direct                    use ggml_conv2d_direct in the vae model
-  --canny                              apply canny preprocessor (edge detection)
-  -v, --verbose                        print extra info
-  --color                              colors the logging tags according to level
-  --chroma-disable-dit-mask            disable dit mask for chroma
-  --chroma-enable-t5-mask              enable t5 mask for chroma
   --increase-ref-index                 automatically increase the indices of references images based on the order they are listed (starting with 1).
   --disable-auto-resize-ref-image      disable auto resize of ref images
-  --taesd-preview-only                 prevents usage of taesd for decoding the final image. (for use with --preview tae)
-  --preview-noisy                      enables previewing noisy inputs of the models rather than the denoised outputs
-  -M, --mode                           run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
-  --type                               weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
-                                       type of the weight file
-  --rng                                RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
-  --sampler-rng                        sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
   -s, --seed                           RNG seed (default: 42, use random seed for < 0)
   --sampling-method                    sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
                                        tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
-  --prediction                         prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
-  --lora-apply-mode                    the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
-                                       contain any quantized parameters, the at_runtime mode will be used; otherwise,
-                                       immediately will be used.The immediately mode may have precision and
-                                       compatibility issues with quantized parameters, but it usually offers faster inference
-                                       speed and, in some cases, lower memory usage. The at_runtime mode, on the
-                                       other hand, is exactly the opposite.
+  --high-noise-sampling-method         (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
+                                       ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
   --scheduler                          denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
                                        default: discrete
   --skip-layers                        layers to skip for SLG steps (default: [7,8,9])
-  --high-noise-sampling-method         (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
-                                       ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
   --high-noise-skip-layers             (high noise) layers to skip for SLG steps (default: [7,8,9])
   -r, --ref-image                      reference image for Flux Kontext models (can be used multiple times)
-  -h, --help                           show this help message and exit
-  --vae-tile-size                      tile size for vae tiling, format [X]x[Y] (default: 32x32)
-  --vae-relative-tile-size             relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
-                                       (overrides --vae-tile-size)
-  --preview                            preview method. must be one of the following [none, proj, tae, vae] (default is none)
   --easycache                          enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
 ```
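
Most of the reorganized help text above is self-explanatory; the one option whose behaviour takes a paragraph is `--lora-apply-mode`. A rough sketch of the auto rule it describes, with hypothetical names rather than the actual stable-diffusion.cpp implementation: if any model weight is quantized, fall back to at_runtime, otherwise merge the LoRA immediately.

```cpp
#include <vector>

enum class LoraApplyMode { AUTO, IMMEDIATELY, AT_RUNTIME };

// Sketch of the selection rule described in the help text above.
// `tensor_is_quantized` is a stand-in for inspecting the loaded model weights.
LoraApplyMode resolve_lora_apply_mode(LoraApplyMode requested,
                                      const std::vector<bool>& tensor_is_quantized) {
    if (requested != LoraApplyMode::AUTO) {
        return requested;  // user forced immediately or at_runtime
    }
    for (bool quantized : tensor_is_quantized) {
        if (quantized) {
            // Quantized weights: merging a LoRA directly into them can lose
            // precision, so apply it at runtime instead.
            return LoraApplyMode::AT_RUNTIME;
        }
    }
    // All weights are full/half precision: merging immediately is usually
    // faster and can use less memory during inference.
    return LoraApplyMode::IMMEDIATELY;
}
```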

File diff suppressed because it is too large.


@@ -2094,12 +2094,12 @@ public:
    }

    ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
        int64_t t0 = ggml_time_ms();
        ggml_tensor* result = nullptr;
        const int vae_scale_factor = get_vae_scale_factor();
        int W = x->ne[0] / vae_scale_factor;
        int H = x->ne[1] / vae_scale_factor;
        int C = get_latent_channel();
        if (vae_tiling_params.enabled && !encode_video) {
            // TODO wan2.2 vae support?
            int ne2;

@@ -2224,8 +2224,8 @@
        const int vae_scale_factor = get_vae_scale_factor();
        int64_t W = x->ne[0] * vae_scale_factor;
        int64_t H = x->ne[1] * vae_scale_factor;
        int64_t C = 3;
        ggml_tensor* result = nullptr;
        if (decode_video) {
            int T = x->ne[2];
            if (sd_version_is_wan(version)) {
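
The two hunks above touch only formatting, but they show the encode/decode symmetry around the VAE scale factor: vae_encode divides the pixel dimensions by get_vae_scale_factor() to get the latent size, and vae_decode multiplies them back and always emits 3 channels. A toy illustration with assumed values (a factor of 8 and 4 latent channels, as in SD1.x-style VAEs):

```cpp
#include <cstdio>

int main() {
    // Assumed values for illustration; the real code asks the model via
    // get_vae_scale_factor() and get_latent_channel().
    const int vae_scale_factor = 8;
    const int latent_channels  = 4;

    const int img_w = 512, img_h = 512;

    // vae_encode: pixel space -> latent space
    const int lat_w = img_w / vae_scale_factor;  // 64
    const int lat_h = img_h / vae_scale_factor;  // 64

    // vae_decode: latent space -> pixel space, always 3 output channels
    const int out_w = lat_w * vae_scale_factor;  // 512
    const int out_h = lat_h * vae_scale_factor;  // 512

    std::printf("latent %dx%dx%d -> image %dx%dx3\n",
                lat_w, lat_h, latent_channels, out_w, out_h);
    return 0;
}
```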


@@ -378,19 +378,19 @@ const char* sd_get_system_info() {
     static char buffer[1024];
     std::stringstream ss;
     ss << "System Info: \n";
-    ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
-    ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
-    ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
-    ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl;
-    ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
-    ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
-    ss << " FMA = " << ggml_cpu_has_fma() << std::endl;
-    ss << " NEON = " << ggml_cpu_has_neon() << std::endl;
-    ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
-    ss << " F16C = " << ggml_cpu_has_f16c() << std::endl;
-    ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
-    ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
-    ss << " VSX = " << ggml_cpu_has_vsx() << std::endl;
+    ss << " SSE3 = " << ggml_cpu_has_sse3() << " | ";
+    ss << " AVX = " << ggml_cpu_has_avx() << " | ";
+    ss << " AVX2 = " << ggml_cpu_has_avx2() << " | ";
+    ss << " AVX512 = " << ggml_cpu_has_avx512() << " | ";
+    ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
+    ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
+    ss << " FMA = " << ggml_cpu_has_fma() << " | ";
+    ss << " NEON = " << ggml_cpu_has_neon() << " | ";
+    ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
+    ss << " F16C = " << ggml_cpu_has_f16c() << " | ";
+    ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
+    ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
+    ss << " VSX = " << ggml_cpu_has_vsx() << " | ";
     snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
     return buffer;
 }
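
The only change here is the separator: sd_get_system_info() now joins the CPU feature flags with " | " instead of printing one per line, so the report fits on a single log line. On a machine with AVX2, for example, the buffer would look roughly like this (the flag values are hypothetical and depend on the host, and the line keeps a trailing separator because every entry appends one):

```
System Info:
 SSE3 = 1 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | VSX = 0 |
```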