refactor: split SDParams to SDCliParams/SDContextParams/SDGenerationParams (#1032)

leejet 2025-12-03 22:31:46 +08:00 committed by GitHub
parent edf2cb3846
commit 5865b5e703
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 1588 additions and 1444 deletions
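
The split mirrors the three option groups in the updated help text below: process-level CLI options, options that configure the sd context, and per-generation options. A minimal sketch of what the new structs might look like; the field names here are inferred from those option groups and are not copied from the CLI code:

```cpp
// Illustrative sketch only: the real structs carry many more fields, and the
// member names/types below are assumptions based on the help text groups.
#include <cstdint>
#include <string>

struct SDCliParams {        // "CLI Options": process-level behaviour
    std::string mode        = "img_gen";
    std::string output_path = "./output.png";
    bool verbose            = false;
    bool canny_preprocess   = false;
};

struct SDContextParams {    // "Context Options": everything needed to build the context
    std::string model_path;
    std::string vae_path;
    int n_threads       = -1;
    bool vae_tiling     = false;
    bool offload_to_cpu = false;
};

struct SDGenerationParams { // "Generation Options": per-image / per-video settings
    std::string prompt;
    std::string negative_prompt;
    int width        = 512;
    int height       = 512;
    int sample_steps = 20;
    float cfg_scale  = 7.0f;
    int64_t seed     = 42;
};

// The CLI can then keep one object per concern instead of one flat SDParams.
struct SDParams {
    SDCliParams cli;
    SDContextParams context;
    SDGenerationParams generation;
};
```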


@@ -3,7 +3,21 @@
 ```
 usage: ./bin/sd [options]
-Options:
+CLI Options:
+  -o, --output <string>                path to write result image to (default: ./output.png)
+  --preview-path <string>              path to write preview image to (default: ./preview.png)
+  --preview-interval <int>             interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
+                                       every step)
+  --canny                              apply canny preprocessor (edge detection)
+  -v, --verbose                        print extra info
+  --color                              colors the logging tags according to level
+  --taesd-preview-only                 prevents usage of taesd for decoding the final image. (for use with --preview tae)
+  --preview-noisy                      enables previewing noisy inputs of the models rather than the denoised outputs
+  -M, --mode                           run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
+  --preview                            preview method. must be one of the following [none, proj, tae, vae] (default is none)
+  -h, --help                           show this help message and exit
+Context Options:
   -m, --model <string>                 path to full model
   --clip_l <string>                    path to the clip-l text encoder
   --clip_g <string>                    path to the clip-g text encoder
@@ -20,25 +34,52 @@ Options:
   --control-net <string>               path to control net model
   --embd-dir <string>                  embeddings directory
   --lora-model-dir <string>            lora model directory
-  -i, --init-img <string>              path to the init image
-  --end-img <string>                   path to the end image, required by flf2v
   --tensor-type-rules <string>         weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
   --photo-maker <string>               path to PHOTOMAKER model
-  --pm-id-images-dir <string>          path to PHOTOMAKER input id images dir
-  --pm-id-embed-path <string>          path to PHOTOMAKER v2 id embed
+  --upscale-model <string>             path to esrgan model.
+  -t, --threads <int>                  number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
+                                       CPU physical cores
+  --chroma-t5-mask-pad <int>           t5 mask pad size of chroma
+  --vae-tile-overlap <float>           tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --flow-shift <float>                 shift value for Flow models like SD3.x or WAN (default: auto)
+  --vae-tiling                         process vae in tiles to reduce memory usage
+  --force-sdxl-vae-conv-scale          force use of conv scale on sdxl vae
+  --offload-to-cpu                     place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
+  --control-net-cpu                    keep controlnet in cpu (for low vram)
+  --clip-on-cpu                        keep clip in cpu (for low vram)
+  --vae-on-cpu                         keep vae in cpu (for low vram)
+  --diffusion-fa                       use flash attention in the diffusion model
+  --diffusion-conv-direct              use ggml_conv2d_direct in the diffusion model
+  --vae-conv-direct                    use ggml_conv2d_direct in the vae model
+  --chroma-disable-dit-mask            disable dit mask for chroma
+  --chroma-enable-t5-mask              enable t5 mask for chroma
+  --type                               weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
+                                       type of the weight file
+  --rng                                RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
+  --sampler-rng                        sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
+  --prediction                         prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
+  --lora-apply-mode                    the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
+                                       contain any quantized parameters, the at_runtime mode will be used; otherwise,
+                                       immediately will be used.The immediately mode may have precision and
+                                       compatibility issues with quantized parameters, but it usually offers faster inference
+                                       speed and, in some cases, lower memory usage. The at_runtime mode, on the
+                                       other hand, is exactly the opposite.
+  --vae-tile-size                      tile size for vae tiling, format [X]x[Y] (default: 32x32)
+  --vae-relative-tile-size             relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
+                                       (overrides --vae-tile-size)
+Generation Options:
+  -p, --prompt <string>                the prompt to render
+  -n, --negative-prompt <string>       the negative prompt (default: "")
+  -i, --init-img <string>              path to the init image
+  --end-img <string>                   path to the end image, required by flf2v
   --mask <string>                      path to the mask image
   --control-image <string>             path to control image, control net
   --control-video <string>             path to control video frames, It must be a directory path. The video frames inside should be stored as images in
                                        lexicographical (character) order. For example, if the control video path is
                                        `frames`, the directory contain images such as 00.png, 01.png, ... etc.
-  -o, --output <string>                path to write result image to (default: ./output.png)
-  -p, --prompt <string>                the prompt to render
-  -n, --negative-prompt <string>       the negative prompt (default: "")
-  --preview-path <string>              path to write preview image to (default: ./preview.png)
-  --upscale-model <string>             path to esrgan model.
-  -t, --threads <int>                  number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
-                                       CPU physical cores
-  --upscale-repeats <int>              Run the ESRGAN upscaler this many times (default: 1)
+  --pm-id-images-dir <string>          path to PHOTOMAKER input id images dir
+  --pm-id-embed-path <string>          path to PHOTOMAKER v2 id embed
   -H, --height <int>                   image height, in pixel space (default: 512)
   -W, --width <int>                    image width, in pixel space (default: 512)
   --steps <int>                        number of sample steps (default: 20)
@@ -46,13 +87,11 @@ Options:
   --clip-skip <int>                    ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
                                        will be 1 for SD1.x, 2 for SD2.x
   -b, --batch-count <int>              batch count
-  --chroma-t5-mask-pad <int>           t5 mask pad size of chroma
   --video-frames <int>                 video frames (default: 1)
   --fps <int>                          fps (default: 24)
   --timestep-shift <int>               shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
                                        NitroSD-Vibrant
-  --preview-interval <int>             interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
-                                       every step)
+  --upscale-repeats <int>              Run the ESRGAN upscaler this many times (default: 1)
   --cfg-scale <float>                  unconditional guidance scale: (default: 7.0)
   --img-cfg-scale <float>              image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
   --guidance <float>                   distilled guidance scale for models with guidance input (default: 3.5)
@@ -72,53 +111,18 @@ Options:
   --pm-style-strength <float>
   --control-strength <float>           strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
   --moe-boundary <float>               timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
-  --flow-shift <float>                 shift value for Flow models like SD3.x or WAN (default: auto)
   --vace-strength <float>              wan vace strength
-  --vae-tile-overlap <float>           tile overlap for vae tiling, in fraction of tile size (default: 0.5)
-  --vae-tiling                         process vae in tiles to reduce memory usage
-  --force-sdxl-vae-conv-scale          force use of conv scale on sdxl vae
-  --offload-to-cpu                     place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
-  --control-net-cpu                    keep controlnet in cpu (for low vram)
-  --clip-on-cpu                        keep clip in cpu (for low vram)
-  --vae-on-cpu                         keep vae in cpu (for low vram)
-  --diffusion-fa                       use flash attention in the diffusion model
-  --diffusion-conv-direct              use ggml_conv2d_direct in the diffusion model
-  --vae-conv-direct                    use ggml_conv2d_direct in the vae model
-  --canny                              apply canny preprocessor (edge detection)
-  -v, --verbose                        print extra info
-  --color                              colors the logging tags according to level
-  --chroma-disable-dit-mask            disable dit mask for chroma
-  --chroma-enable-t5-mask              enable t5 mask for chroma
   --increase-ref-index                 automatically increase the indices of references images based on the order they are listed (starting with 1).
   --disable-auto-resize-ref-image      disable auto resize of ref images
-  --taesd-preview-only                 prevents usage of taesd for decoding the final image. (for use with --preview tae)
-  --preview-noisy                      enables previewing noisy inputs of the models rather than the denoised outputs
-  -M, --mode                           run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
-  --type                               weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
-                                       type of the weight file
-  --rng                                RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
-  --sampler-rng                        sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
   -s, --seed                           RNG seed (default: 42, use random seed for < 0)
   --sampling-method                    sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
                                        tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
-  --prediction                         prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
-  --lora-apply-mode                    the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
-                                       contain any quantized parameters, the at_runtime mode will be used; otherwise,
-                                       immediately will be used.The immediately mode may have precision and
-                                       compatibility issues with quantized parameters, but it usually offers faster inference
-                                       speed and, in some cases, lower memory usage. The at_runtime mode, on the
-                                       other hand, is exactly the opposite.
+  --high-noise-sampling-method         (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
+                                       ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
   --scheduler                          denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
                                        default: discrete
   --skip-layers                        layers to skip for SLG steps (default: [7,8,9])
-  --high-noise-sampling-method         (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
-                                       ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
   --high-noise-skip-layers             (high noise) layers to skip for SLG steps (default: [7,8,9])
   -r, --ref-image                      reference image for Flux Kontext models (can be used multiple times)
-  -h, --help                           show this help message and exit
-  --vae-tile-size                      tile size for vae tiling, format [X]x[Y] (default: 32x32)
-  --vae-relative-tile-size             relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
-                                       (overrides --vae-tile-size)
-  --preview                            preview method. must be one of the following [none, proj, tae, vae] (default is none)
   --easycache                          enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
 ```
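
Most of the reorganized help text above is self-explanatory; the one option whose behaviour takes a paragraph is `--lora-apply-mode`. A rough sketch of the auto rule it describes, with hypothetical names rather than the actual stable-diffusion.cpp implementation: if any model weight is quantized, fall back to at_runtime, otherwise merge the LoRA immediately.

```cpp
#include <vector>

enum class LoraApplyMode { AUTO, IMMEDIATELY, AT_RUNTIME };

// Sketch of the selection rule described in the help text above.
// `tensor_is_quantized` is a stand-in for inspecting the loaded model weights.
LoraApplyMode resolve_lora_apply_mode(LoraApplyMode requested,
                                      const std::vector<bool>& tensor_is_quantized) {
    if (requested != LoraApplyMode::AUTO) {
        return requested;  // user forced immediately or at_runtime
    }
    for (bool quantized : tensor_is_quantized) {
        if (quantized) {
            // Quantized weights: merging a LoRA directly into them can lose
            // precision, so apply it at runtime instead.
            return LoraApplyMode::AT_RUNTIME;
        }
    }
    // All weights are full/half precision: merging immediately is usually
    // faster and can use less memory during inference.
    return LoraApplyMode::IMMEDIATELY;
}
```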

File diff suppressed because it is too large.


@@ -2094,12 +2094,12 @@ public:
    }

    ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
        int64_t t0 = ggml_time_ms();
        ggml_tensor* result = nullptr;
        const int vae_scale_factor = get_vae_scale_factor();
        int W = x->ne[0] / vae_scale_factor;
        int H = x->ne[1] / vae_scale_factor;
        int C = get_latent_channel();
        if (vae_tiling_params.enabled && !encode_video) {
            // TODO wan2.2 vae support?
            int ne2;

@@ -2224,8 +2224,8 @@
        const int vae_scale_factor = get_vae_scale_factor();
        int64_t W = x->ne[0] * vae_scale_factor;
        int64_t H = x->ne[1] * vae_scale_factor;
        int64_t C = 3;
        ggml_tensor* result = nullptr;
        if (decode_video) {
            int T = x->ne[2];
            if (sd_version_is_wan(version)) {
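
The two hunks above touch only formatting, but they show the encode/decode symmetry around the VAE scale factor: vae_encode divides the pixel dimensions by get_vae_scale_factor() to get the latent size, and vae_decode multiplies them back and always emits 3 channels. A toy illustration with assumed values (a factor of 8 and 4 latent channels, as in SD1.x-style VAEs):

```cpp
#include <cstdio>

int main() {
    // Assumed values for illustration; the real code asks the model via
    // get_vae_scale_factor() and get_latent_channel().
    const int vae_scale_factor = 8;
    const int latent_channels  = 4;

    const int img_w = 512, img_h = 512;

    // vae_encode: pixel space -> latent space
    const int lat_w = img_w / vae_scale_factor;  // 64
    const int lat_h = img_h / vae_scale_factor;  // 64

    // vae_decode: latent space -> pixel space, always 3 output channels
    const int out_w = lat_w * vae_scale_factor;  // 512
    const int out_h = lat_h * vae_scale_factor;  // 512

    std::printf("latent %dx%dx%d -> image %dx%dx3\n",
                lat_w, lat_h, latent_channels, out_w, out_h);
    return 0;
}
```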


@@ -378,19 +378,19 @@ const char* sd_get_system_info() {
     static char buffer[1024];
     std::stringstream ss;
     ss << "System Info: \n";
-    ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
-    ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
-    ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
-    ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl;
-    ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
-    ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
-    ss << " FMA = " << ggml_cpu_has_fma() << std::endl;
-    ss << " NEON = " << ggml_cpu_has_neon() << std::endl;
-    ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
-    ss << " F16C = " << ggml_cpu_has_f16c() << std::endl;
-    ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
-    ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
-    ss << " VSX = " << ggml_cpu_has_vsx() << std::endl;
+    ss << " SSE3 = " << ggml_cpu_has_sse3() << " | ";
+    ss << " AVX = " << ggml_cpu_has_avx() << " | ";
+    ss << " AVX2 = " << ggml_cpu_has_avx2() << " | ";
+    ss << " AVX512 = " << ggml_cpu_has_avx512() << " | ";
+    ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
+    ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
+    ss << " FMA = " << ggml_cpu_has_fma() << " | ";
+    ss << " NEON = " << ggml_cpu_has_neon() << " | ";
+    ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
+    ss << " F16C = " << ggml_cpu_has_f16c() << " | ";
+    ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
+    ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
+    ss << " VSX = " << ggml_cpu_has_vsx() << " | ";
     snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
     return buffer;
 }
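
The only change here is the separator: sd_get_system_info() now joins the CPU feature flags with " | " instead of printing one per line, so the report fits on a single log line. On a machine with AVX2, for example, the buffer would look roughly like this (the flag values are hypothetical and depend on the host, and the line keeps a trailing separator because every entry appends one):

```
System Info:
 SSE3 = 1 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | VSX = 0 |
```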