Compare commits

..

No commits in common. "0723ee51c9f8ec84af1384591d8899d77beed315" and "db6f4791b4a41b909a6138bc90e122dc85284de2" have entirely different histories.

4 changed files with 319 additions and 596 deletions

View File

@ -1,110 +1,113 @@
# Run # Run
``` ```
usage: ./bin/sd [options] usage: ./bin/sd [arguments]
Options: arguments:
-m, --model <string> path to full model -h, --help show this help message and exit
--clip_l <string> path to the clip-l text encoder -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen
--clip_g <string> path to the clip-g text encoder -t, --threads N number of threads to use during computation (default: -1)
--clip_vision <string> path to the clip-vision encoder If threads <= 0, then threads will be set to the number of CPU physical cores
--t5xxl <string> path to the t5xxl text encoder --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--qwen2vl <string> path to the qwen2vl text encoder -m, --model [MODEL] path to full model
--qwen2vl_vision <string> path to the qwen2vl vit --diffusion-model path to the standalone diffusion model
--diffusion-model <string> path to the standalone diffusion model --high-noise-diffusion-model path to the standalone high noise diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model --clip_l path to the clip-l text encoder
--vae <string> path to standalone vae model --clip_g path to the clip-g text encoder
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --clip_vision path to the clip-vision encoder
--control-net <string> path to control net model --t5xxl path to the t5xxl text encoder
--embd-dir <string> embeddings directory --qwen2vl path to the qwen2vl text encoder
--lora-model-dir <string> lora model directory --qwen2vl_vision path to the qwen2vl vit
-i, --init-img <string> path to the init image --vae [VAE] path to vae
--end-img <string> path to the end image, required by flf2v --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --control-net [CONTROL_PATH] path to control net model
--photo-maker <string> path to PHOTOMAKER model --embd-dir [EMBEDDING_PATH] path to embeddings
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --upscale-model [ESRGAN_PATH] path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed --upscale-repeats Run the ESRGAN upscaler this many times (default 1)
--mask <string> path to the mask image --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
--control-image <string> path to control image, control net If not specified, the default is the type of the weight file
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
lexicographical (character) order. For example, if the control video path is --lora-model-dir [DIR] lora model directory
`frames`, the directory contain images such as 00.png, 01.png, ... etc. -i, --init-img [IMAGE] path to the init image, required by img2img
-o, --output <string> path to write result image to (default: ./output.png) --mask [MASK] path to the mask image, required by img2img with mask
-p, --prompt <string> the prompt to render -i, --end-img [IMAGE] path to the end image, required by flf2v
-n, --negative-prompt <string> the negative prompt (default: "") --control-image [IMAGE] path to image condition, control net
--upscale-model <string> path to esrgan model. -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of --control-video [PATH] path to control video frames, It must be a directory path.
CPU physical cores The video frames inside should be stored as images in lexicographical (character) order
  --upscale-repeats <int>                Run the ESRGAN upscaler this many times (default: 1)                                                  For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.
-H, --height <int> image height, in pixel space (default: 512) --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
-W, --width <int> image width, in pixel space (default: 512) -o, --output OUTPUT path to write result image to (default: ./output.png)
--steps <int> number of sample steps (default: 20) -p, --prompt [PROMPT] the prompt to render
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto) -n, --negative-prompt PROMPT the negative prompt (default: "")
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, --cfg-scale SCALE unconditional guidance scale: (default: 7.0)
will be 1 for SD1.x, 2 for SD2.x --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
-b, --batch-count <int> batch count --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--video-frames <int> video frames (default: 1) 0 means disabled, a value of 2.5 is nice for sd3.5 medium
--fps <int> fps (default: 24) --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
NitroSD-Vibrant --skip-layer-start START SLG enabling point: (default: 0.01)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --skip-layer-end END SLG disabling point: (default: 0.2)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise)
medium --timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--skip-layer-start <float> SLG enabling point (default: 0.01) --steps STEPS number of sample steps (default: 20)
--skip-layer-end <float> SLG disabling point (default: 0.2) --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0) --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) 0 means disabled, a value of 2.5 is nice for sd3.5 medium
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) --high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2) --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)
--high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0) --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)
--strength <float> strength for noising/unnoising (default: 0.75) --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
--pm-style-strength <float> --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image (high noise) sampling method (default: "euler_a")
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto) SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--vace-strength <float> wan vace strength --strength STRENGTH strength for noising/unnoising (default: 0.75)
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5) --control-strength STRENGTH strength to apply Control Net (default: 0.9)
--vae-tiling process vae in tiles to reduce memory usage 1.0 corresponds to full destruction of information in init image
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae -H, --height H image height, in pixel space (default: 512)
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed -W, --width W image width, in pixel space (default: 512)
--control-net-cpu keep controlnet in cpu (for low vram) --rng {std_default, cuda} RNG (default: cuda)
--clip-on-cpu keep clip in cpu (for low vram) -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
--vae-on-cpu keep vae in cpu (for low vram) -b, --batch-count COUNT number of images to generate
--diffusion-fa use flash attention in the diffusion model --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
--vae-conv-direct use ggml_conv2d_direct in the vae model <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--canny apply canny preprocessor (edge detection) --vae-tiling process vae in tiles to reduce memory usage
-v, --verbose print extra info --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)
--color colors the logging tags according to level --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
--chroma-disable-dit-mask disable dit mask for chroma --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--chroma-enable-t5-mask enable t5 mask for chroma --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --vae-on-cpu keep vae in cpu (for low vram)
--disable-auto-resize-ref-image disable auto resize of ref images --clip-on-cpu keep clip in cpu (for low vram)
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen --diffusion-fa use flash attention in the diffusion model (for low vram)
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the Might lower quality, since it implies converting k and v to f16.
type of the weight file This might crash if it is not supported by the backend.
--rng RNG, one of [std_default, cuda], default: cuda --diffusion-conv-direct use Conv2d direct in the diffusion model
-s, --seed RNG seed (default: 42, use random seed for < 0) This might crash if it is not supported by the backend.
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)
tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) This might crash if it is not supported by the backend.
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow] --control-net-cpu keep controlnet in cpu (for low vram)
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: --canny apply canny preprocessor (edge detection)
discrete --color colors the logging tags according to level
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --chroma-disable-dit-mask disable dit mask for chroma
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --chroma-enable-t5-mask enable t5 mask for chroma
ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
--high-noise-scheduler (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, --video-frames video frames (default: 1)
simple], default: discrete --fps fps (default: 24)
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) only enabled if `--high-noise-steps` is set to -1
-h, --help show this help message and exit --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) --vace-strength wan vace strength
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 --photo-maker path to PHOTOMAKER model
(overrides --vae-tile-size) --pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir
--pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed
--pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20)
-v, --verbose print extra info
``` ```

View File

@ -7,7 +7,6 @@
#include <map> #include <map>
#include <random> #include <random>
#include <regex> #include <regex>
#include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
@ -81,8 +80,7 @@ struct SDParams {
std::string control_image_path; std::string control_image_path;
std::vector<std::string> ref_image_paths; std::vector<std::string> ref_image_paths;
std::string control_video_path; std::string control_video_path;
bool auto_resize_ref_image = true; bool increase_ref_index = false;
bool increase_ref_index = false;
std::string prompt; std::string prompt;
std::string negative_prompt; std::string negative_prompt;
@ -177,7 +175,6 @@ void print_params(SDParams params) {
printf(" %s\n", path.c_str()); printf(" %s\n", path.c_str());
}; };
printf(" control_video_path: %s\n", params.control_video_path.c_str()); printf(" control_video_path: %s\n", params.control_video_path.c_str());
printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false");
printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false");
printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
@ -214,6 +211,118 @@ void print_params(SDParams params) {
free(high_noise_sample_params_str); free(high_noise_sample_params_str);
} }
// Print the CLI help text for all supported arguments to stdout.
//
// The help is kept as a name/description table so the option-name column
// stays aligned by one formatting rule instead of hand-padded printf
// strings, and multi-line descriptions are expressed with embedded '\n'.
//
// Fix: "--end-img" was previously listed as "-i, --end-img", but parse_args
// binds "-i" only to "--init-img"; "--end-img" has no short flag.
void print_usage(int argc, const char* argv[]) {
    // Column (including the 2-space left margin) where descriptions start.
    constexpr size_t name_col = 38;

    struct HelpEntry {
        const char* name;  // option name(s) plus value hint
        const char* desc;  // description; '\n' separates continuation lines
    };

    static const HelpEntry entries[] = {
        {"-h, --help", "show this help message and exit"},
        {"-M, --mode [MODE]", "run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen"},
        {"-t, --threads N", "number of threads to use during computation (default: -1)\nIf threads <= 0, then threads will be set to the number of CPU physical cores"},
        {"--offload-to-cpu", "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed"},
        {"-m, --model [MODEL]", "path to full model"},
        {"--diffusion-model", "path to the standalone diffusion model"},
        {"--high-noise-diffusion-model", "path to the standalone high noise diffusion model"},
        {"--clip_l", "path to the clip-l text encoder"},
        {"--clip_g", "path to the clip-g text encoder"},
        {"--clip_vision", "path to the clip-vision encoder"},
        {"--t5xxl", "path to the t5xxl text encoder"},
        {"--qwen2vl", "path to the qwen2vl text encoder"},
        {"--qwen2vl_vision", "path to the qwen2vl vit"},
        {"--vae [VAE]", "path to vae"},
        {"--taesd [TAESD_PATH]", "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)"},
        {"--control-net [CONTROL_PATH]", "path to control net model"},
        {"--embd-dir [EMBEDDING_PATH]", "path to embeddings"},
        {"--upscale-model [ESRGAN_PATH]", "path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now"},
        {"--upscale-repeats", "Run the ESRGAN upscaler this many times (default 1)"},
        {"--type [TYPE]", "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\nIf not specified, the default is the type of the weight file"},
        {"--tensor-type-rules [EXPRESSION]", "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")"},
        {"--lora-model-dir [DIR]", "lora model directory"},
        {"-i, --init-img [IMAGE]", "path to the init image, required by img2img"},
        {"--mask [MASK]", "path to the mask image, required by img2img with mask"},
        // No "-i" here: parse_args only maps "-i" to --init-img.
        {"--end-img [IMAGE]", "path to the end image, required by flf2v"},
        {"--control-image [IMAGE]", "path to image condition, control net"},
        {"-r, --ref-image [PATH]", "reference image for Flux Kontext models (can be used multiple times) "},
        {"--control-video [PATH]", "path to control video frames, It must be a directory path.\nThe video frames inside should be stored as images in lexicographical (character) order\nFor example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc."},
        {"--increase-ref-index", "automatically increase the indices of references images based on the order they are listed (starting with 1)."},
        {"-o, --output OUTPUT", "path to write result image to (default: ./output.png)"},
        {"-p, --prompt [PROMPT]", "the prompt to render"},
        {"-n, --negative-prompt PROMPT", "the negative prompt (default: \"\")"},
        {"--cfg-scale SCALE", "unconditional guidance scale: (default: 7.0)"},
        {"--img-cfg-scale SCALE", "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)"},
        {"--guidance SCALE", "distilled guidance scale for models with guidance input (default: 3.5)"},
        {"--slg-scale SCALE", "skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n0 means disabled, a value of 2.5 is nice for sd3.5 medium"},
        {"--eta SCALE", "eta in DDIM, only for DDIM and TCD: (default: 0)"},
        {"--skip-layers LAYERS", "Layers to skip for SLG steps: (default: [7,8,9])"},
        {"--skip-layer-start START", "SLG enabling point: (default: 0.01)"},
        {"--skip-layer-end END", "SLG disabling point: (default: 0.2)"},
        {"--scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple}", "Denoiser sigma scheduler (default: discrete)"},
        {"--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}", "sampling method (default: \"euler\" for Flux/SD3/Wan, \"euler_a\" otherwise)"},
        {"--timestep-shift N", "shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant"},
        {"--steps STEPS", "number of sample steps (default: 20)"},
        {"--high-noise-cfg-scale SCALE", "(high noise) unconditional guidance scale: (default: 7.0)"},
        {"--high-noise-img-cfg-scale SCALE", "(high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)"},
        {"--high-noise-guidance SCALE", "(high noise) distilled guidance scale for models with guidance input (default: 3.5)"},
        {"--high-noise-slg-scale SCALE", "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n0 means disabled, a value of 2.5 is nice for sd3.5 medium"},
        {"--high-noise-eta SCALE", "(high noise) eta in DDIM, only for DDIM and TCD: (default: 0)"},
        {"--high-noise-skip-layers LAYERS", "(high noise) Layers to skip for SLG steps: (default: [7,8,9])"},
        {"--high-noise-skip-layer-start", "(high noise) SLG enabling point: (default: 0.01)"},
        {"--high-noise-skip-layer-end END", "(high noise) SLG disabling point: (default: 0.2)"},
        {"--high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple}", "Denoiser sigma scheduler (default: discrete)"},
        {"--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}", "(high noise) sampling method (default: \"euler_a\")"},
        {"--high-noise-steps STEPS", "(high noise) number of sample steps (default: -1 = auto)\nSLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])"},
        {"--strength STRENGTH", "strength for noising/unnoising (default: 0.75)"},
        {"--control-strength STRENGTH", "strength to apply Control Net (default: 0.9)\n1.0 corresponds to full destruction of information in init image"},
        {"-H, --height H", "image height, in pixel space (default: 512)"},
        {"-W, --width W", "image width, in pixel space (default: 512)"},
        {"--rng {std_default, cuda}", "RNG (default: cuda)"},
        {"-s SEED, --seed SEED", "RNG seed (default: 42, use random seed for < 0)"},
        {"-b, --batch-count COUNT", "number of images to generate"},
        {"--prediction {eps, v, edm_v, sd3_flow, flux_flow}", "Prediction type override."},
        {"--clip-skip N", "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x"},
        {"--vae-tiling", "process vae in tiles to reduce memory usage"},
        {"--vae-tile-size [X]x[Y]", "tile size for vae tiling (default: 32x32)"},
        {"--vae-relative-tile-size [X]x[Y]", "relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)"},
        {"--vae-tile-overlap OVERLAP", "tile overlap for vae tiling, in fraction of tile size (default: 0.5)"},
        {"--force-sdxl-vae-conv-scale", "force use of conv scale on sdxl vae"},
        {"--vae-on-cpu", "keep vae in cpu (for low vram)"},
        {"--clip-on-cpu", "keep clip in cpu (for low vram)"},
        {"--diffusion-fa", "use flash attention in the diffusion model (for low vram)\nMight lower quality, since it implies converting k and v to f16.\nThis might crash if it is not supported by the backend."},
        {"--diffusion-conv-direct", "use Conv2d direct in the diffusion model\nThis might crash if it is not supported by the backend."},
        {"--vae-conv-direct", "use Conv2d direct in the vae model (should improve the performance)\nThis might crash if it is not supported by the backend."},
        {"--control-net-cpu", "keep controlnet in cpu (for low vram)"},
        {"--canny", "apply canny preprocessor (edge detection)"},
        {"--color", "colors the logging tags according to level"},
        {"--chroma-disable-dit-mask", "disable dit mask for chroma"},
        {"--chroma-enable-t5-mask", "enable t5 mask for chroma"},
        {"--chroma-t5-mask-pad PAD_SIZE", "t5 mask pad size of chroma"},
        {"--video-frames", "video frames (default: 1)"},
        {"--fps", "fps (default: 24)"},
        {"--moe-boundary BOUNDARY", "timestep boundary for Wan2.2 MoE model. (default: 0.875)\nonly enabled if `--high-noise-steps` is set to -1"},
        {"--flow-shift SHIFT", "shift value for Flow models like SD3.x or WAN (default: auto)"},
        {"--vace-strength", "wan vace strength"},
        {"--photo-maker", "path to PHOTOMAKER model"},
        {"--pm-id-images-dir [DIR]", "path to PHOTOMAKER input id images dir"},
        {"--pm-id-embed-path [PATH]", "path to PHOTOMAKER v2 id embed"},
        {"--pm-style-strength", "strength for keeping PHOTOMAKER input identity (default: 20)"},
        {"-v, --verbose", "print extra info"},
    };

    printf("usage: %s [arguments]\n", argv[0]);
    printf("\n");
    printf("arguments:\n");
    for (const HelpEntry& e : entries) {
        // Build the left column: 2-space margin + names, padded to name_col.
        std::string name_field = "  ";
        name_field += e.name;
        if (name_field.size() < name_col) {
            name_field.append(name_col - name_field.size(), ' ');
        } else {
            name_field += ' ';  // name overflows the column; keep one separator
        }
        const std::string desc = e.desc;
        size_t start      = 0;
        bool first_line   = true;
        while (start <= desc.size()) {
            size_t nl  = desc.find('\n', start);
            size_t end = (nl == std::string::npos) ? desc.size() : nl;
            std::string part = desc.substr(start, end - start);
            if (first_line) {
                printf("%s%s\n", name_field.c_str(), part.c_str());
                first_line = false;
            } else {
                // Continuation lines align with the description column.
                printf("%*s%s\n", (int)name_col, "", part.c_str());
            }
            if (nl == std::string::npos) {
                break;
            }
            start = nl + 1;
        }
    }
}
#if defined(_WIN32) #if defined(_WIN32)
static std::string utf16_to_utf8(const std::wstring& wstr) { static std::string utf16_to_utf8(const std::wstring& wstr) {
if (wstr.empty()) if (wstr.empty())
@ -383,424 +492,93 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) {
return true; return true;
} }
// Word-wrap `text` to at most `width` columns, indenting every wrapped or
// manual continuation line with `indent` spaces.
//
// Fixes over the previous version:
//   * after breaking at a space, line_len is now indent + carried-tail length
//     (the old code reset it to `indent`, miscounting the new line and letting
//     subsequent lines exceed `width`);
//   * tracks the last breakable space incrementally instead of rescanning the
//     whole accumulated buffer on every break (O(n) instead of O(n^2)).
static std::string wrap_text(const std::string& text, size_t width, size_t indent) {
    std::string out;
    size_t line_len = 0;
    // Index into `out` of the last space on the current line, or npos.
    size_t last_space = std::string::npos;

    for (char c : text) {
        // Preserve manual newlines, starting the next line at `indent`.
        if (c == '\n') {
            out += '\n';
            out.append(indent, ' ');
            line_len   = indent;
            last_space = std::string::npos;
            continue;
        }

        out += c;
        if (c == ' ') {
            last_space = out.size() - 1;
        }
        ++line_len;

        if (line_len >= width) {
            if (last_space != std::string::npos) {
                // Clean break: replace the last space with newline + indent,
                // carrying the partial word to the new line.
                std::string tail = out.substr(last_space + 1);
                out.erase(last_space);
                out += '\n';
                out.append(indent, ' ');
                out += tail;
                line_len = indent + tail.size();
            } else {
                // No space on this line: hard break at width.
                out += '\n';
                out.append(indent, ' ');
                line_len = indent;
            }
            // The carried tail contains no spaces (it followed the last one).
            last_space = std::string::npos;
        }
    }
    return out;
}
// Print the auto-generated usage/help table for every registered option.
// Rows are collected from all option groups, the names column is sized to
// the widest entry, and each description is wrapped to the line width.
void print_usage(int argc, const char* argv[], const ArgOptions& options) {
    constexpr size_t max_line_width = 120;

    std::cout << "Usage: " << argv[0] << " [options]\n\n";
    std::cout << "Options:\n";

    // One row of the help table: names column + description.
    struct Row {
        std::string label;
        std::string desc;
    };
    std::vector<Row> rows;

    // Build the label "<short>, <long> <hint>", skipping empty parts.
    auto push_row = [&rows](const std::string& short_name, const std::string& long_name,
                            const std::string& desc, const std::string& hint = "") {
        std::string label;
        if (!short_name.empty()) {
            label += short_name;
        }
        if (!short_name.empty() && !long_name.empty()) {
            label += ", ";
        }
        if (!long_name.empty()) {
            label += long_name;
        }
        if (!hint.empty()) {
            label += " ";
            label += hint;
        }
        rows.push_back({label, desc});
    };

    for (const auto& o : options.string_options)
        push_row(o.short_name, o.long_name, o.desc, "<string>");
    for (const auto& o : options.int_options)
        push_row(o.short_name, o.long_name, o.desc, "<int>");
    for (const auto& o : options.float_options)
        push_row(o.short_name, o.long_name, o.desc, "<float>");
    for (const auto& o : options.bool_options)
        push_row(o.short_name, o.long_name, o.desc);
    for (const auto& o : options.manual_options)
        push_row(o.short_name, o.long_name, o.desc);

    // Width of the widest label, used to align every description.
    size_t name_col_width = 0;
    for (const auto& row : rows) {
        name_col_width = std::max(name_col_width, row.label.size());
    }

    for (const auto& row : rows) {
        // Descriptions start after: 2-space margin + names column + 4-space gap.
        const size_t desc_indent = 2 + name_col_width + 4;
        std::cout << "  " << std::left
                  << std::setw(static_cast<int>(name_col_width) + 4) << row.label
                  << wrap_text(row.desc, max_line_width, desc_indent) << "\n";
    }
}
void parse_args(int argc, const char** argv, SDParams& params) { void parse_args(int argc, const char** argv, SDParams& params) {
ArgOptions options; ArgOptions options;
options.string_options = { options.string_options = {
{"-m", {"-m", "--model", "", &params.model_path},
"--model", {"", "--clip_l", "", &params.clip_l_path},
"path to full model", {"", "--clip_g", "", &params.clip_g_path},
&params.model_path}, {"", "--clip_vision", "", &params.clip_vision_path},
{"", {"", "--t5xxl", "", &params.t5xxl_path},
"--clip_l", {"", "--qwen2vl", "", &params.qwen2vl_path},
"path to the clip-l text encoder", &params.clip_l_path}, {"", "--qwen2vl_vision", "", &params.qwen2vl_vision_path},
{"", "--clip_g", {"", "--diffusion-model", "", &params.diffusion_model_path},
"path to the clip-g text encoder", {"", "--high-noise-diffusion-model", "", &params.high_noise_diffusion_model_path},
&params.clip_g_path}, {"", "--vae", "", &params.vae_path},
{"", {"", "--taesd", "", &params.taesd_path},
"--clip_vision", {"", "--control-net", "", &params.control_net_path},
"path to the clip-vision encoder", {"", "--embd-dir", "", &params.embedding_dir},
&params.clip_vision_path}, {"", "--lora-model-dir", "", &params.lora_model_dir},
{"", {"-i", "--init-img", "", &params.init_image_path},
"--t5xxl", {"", "--end-img", "", &params.end_image_path},
"path to the t5xxl text encoder", {"", "--tensor-type-rules", "", &params.tensor_type_rules},
&params.t5xxl_path}, {"", "--photo-maker", "", &params.photo_maker_path},
{"", {"", "--pm-id-images-dir", "", &params.pm_id_images_dir},
"--qwen2vl", {"", "--pm-id-embed-path", "", &params.pm_id_embed_path},
"path to the qwen2vl text encoder", {"", "--mask", "", &params.mask_image_path},
&params.qwen2vl_path}, {"", "--control-image", "", &params.control_image_path},
{"", {"", "--control-video", "", &params.control_video_path},
"--qwen2vl_vision", {"-o", "--output", "", &params.output_path},
"path to the qwen2vl vit", {"-p", "--prompt", "", &params.prompt},
&params.qwen2vl_vision_path}, {"-n", "--negative-prompt", "", &params.negative_prompt},
{"", {"", "--upscale-model", "", &params.esrgan_path},
"--diffusion-model",
"path to the standalone diffusion model",
&params.diffusion_model_path},
{"",
"--high-noise-diffusion-model",
"path to the standalone high noise diffusion model",
&params.high_noise_diffusion_model_path},
{"",
"--vae",
"path to standalone vae model",
&params.vae_path},
{"",
"--taesd",
"path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
&params.taesd_path},
{"",
"--control-net",
"path to control net model",
&params.control_net_path},
{"",
"--embd-dir",
"embeddings directory",
&params.embedding_dir},
{"",
"--lora-model-dir",
"lora model directory",
&params.lora_model_dir},
{"-i",
"--init-img",
"path to the init image",
&params.init_image_path},
{"",
"--end-img",
"path to the end image, required by flf2v",
&params.end_image_path},
{"",
"--tensor-type-rules",
"weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
&params.tensor_type_rules},
{"",
"--photo-maker",
"path to PHOTOMAKER model",
&params.photo_maker_path},
{"",
"--pm-id-images-dir",
"path to PHOTOMAKER input id images dir",
&params.pm_id_images_dir},
{"",
"--pm-id-embed-path",
"path to PHOTOMAKER v2 id embed",
&params.pm_id_embed_path},
{"",
"--mask",
"path to the mask image",
&params.mask_image_path},
{"",
"--control-image",
"path to control image, control net",
&params.control_image_path},
{"",
"--control-video",
"path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
"lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
"such as 00.png, 01.png, ... etc.",
&params.control_video_path},
{"-o",
"--output",
"path to write result image to (default: ./output.png)",
&params.output_path},
{"-p",
"--prompt",
"the prompt to render",
&params.prompt},
{"-n",
"--negative-prompt",
"the negative prompt (default: \"\")",
&params.negative_prompt},
{"",
"--upscale-model",
"path to esrgan model.",
&params.esrgan_path},
}; };
options.int_options = { options.int_options = {
{"-t", {"-t", "--threads", "", &params.n_threads},
"--threads", {"", "--upscale-repeats", "", &params.upscale_repeats},
"number of threads to use during computation (default: -1). " {"-H", "--height", "", &params.height},
"If threads <= 0, then threads will be set to the number of CPU physical cores", {"-W", "--width", "", &params.width},
&params.n_threads}, {"", "--steps", "", &params.sample_params.sample_steps},
{"", {"", "--high-noise-steps", "", &params.high_noise_sample_params.sample_steps},
"--upscale-repeats", {"", "--clip-skip", "", &params.clip_skip},
"Run the ESRGAN upscaler this many times (default: 1)", {"-b", "--batch-count", "", &params.batch_count},
&params.upscale_repeats}, {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad},
{"-H", {"", "--video-frames", "", &params.video_frames},
"--height", {"", "--fps", "", &params.fps},
"image height, in pixel space (default: 512)", {"", "--timestep-shift", "", &params.sample_params.shifted_timestep},
&params.height},
{"-W",
"--width",
"image width, in pixel space (default: 512)",
&params.width},
{"",
"--steps",
"number of sample steps (default: 20)",
&params.sample_params.sample_steps},
{"",
"--high-noise-steps",
"(high noise) number of sample steps (default: -1 = auto)",
&params.high_noise_sample_params.sample_steps},
{"",
"--clip-skip",
"ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). "
"<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x",
&params.clip_skip},
{"-b",
"--batch-count",
"batch count",
&params.batch_count},
{"",
"--chroma-t5-mask-pad",
"t5 mask pad size of chroma",
&params.chroma_t5_mask_pad},
{"",
"--video-frames",
"video frames (default: 1)",
&params.video_frames},
{"",
"--fps",
"fps (default: 24)",
&params.fps},
{"",
"--timestep-shift",
"shift timestep for NitroFusion models (default: 0). "
"recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant",
&params.sample_params.shifted_timestep},
}; };
options.float_options = { options.float_options = {
{"", {"", "--cfg-scale", "", &params.sample_params.guidance.txt_cfg},
"--cfg-scale", {"", "--img-cfg-scale", "", &params.sample_params.guidance.img_cfg},
"unconditional guidance scale: (default: 7.0)", {"", "--guidance", "", &params.sample_params.guidance.distilled_guidance},
&params.sample_params.guidance.txt_cfg}, {"", "--slg-scale", "", &params.sample_params.guidance.slg.scale},
{"", {"", "--skip-layer-start", "", &params.sample_params.guidance.slg.layer_start},
"--img-cfg-scale", {"", "--skip-layer-end", "", &params.sample_params.guidance.slg.layer_end},
"image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", {"", "--eta", "", &params.sample_params.eta},
&params.sample_params.guidance.img_cfg}, {"", "--high-noise-cfg-scale", "", &params.high_noise_sample_params.guidance.txt_cfg},
{"", {"", "--high-noise-img-cfg-scale", "", &params.high_noise_sample_params.guidance.img_cfg},
"--guidance", {"", "--high-noise-guidance", "", &params.high_noise_sample_params.guidance.distilled_guidance},
"distilled guidance scale for models with guidance input (default: 3.5)", {"", "--high-noise-slg-scale", "", &params.high_noise_sample_params.guidance.slg.scale},
&params.sample_params.guidance.distilled_guidance}, {"", "--high-noise-skip-layer-start", "", &params.high_noise_sample_params.guidance.slg.layer_start},
{"", {"", "--high-noise-skip-layer-end", "", &params.high_noise_sample_params.guidance.slg.layer_end},
"--slg-scale", {"", "--high-noise-eta", "", &params.high_noise_sample_params.eta},
"skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium", {"", "--strength", "", &params.strength},
&params.sample_params.guidance.slg.scale}, {"", "--pm-style-strength", "", &params.pm_style_strength},
{"", {"", "--control-strength", "", &params.control_strength},
"--skip-layer-start", {"", "--moe-boundary", "", &params.moe_boundary},
"SLG enabling point (default: 0.01)", {"", "--flow-shift", "", &params.flow_shift},
&params.sample_params.guidance.slg.layer_start}, {"", "--vace-strength", "", &params.vace_strength},
{"", {"", "--vae-tile-overlap", "", &params.vae_tiling_params.target_overlap},
"--skip-layer-end",
"SLG disabling point (default: 0.2)",
&params.sample_params.guidance.slg.layer_end},
{"",
"--eta",
"eta in DDIM, only for DDIM and TCD (default: 0)",
&params.sample_params.eta},
{"",
"--high-noise-cfg-scale",
"(high noise) unconditional guidance scale: (default: 7.0)",
&params.high_noise_sample_params.guidance.txt_cfg},
{"",
"--high-noise-img-cfg-scale",
"(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)",
&params.high_noise_sample_params.guidance.img_cfg},
{"",
"--high-noise-guidance",
"(high noise) distilled guidance scale for models with guidance input (default: 3.5)",
&params.high_noise_sample_params.guidance.distilled_guidance},
{"",
"--high-noise-slg-scale",
"(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)",
&params.high_noise_sample_params.guidance.slg.scale},
{"",
"--high-noise-skip-layer-start",
"(high noise) SLG enabling point (default: 0.01)",
&params.high_noise_sample_params.guidance.slg.layer_start},
{"",
"--high-noise-skip-layer-end",
"(high noise) SLG disabling point (default: 0.2)",
&params.high_noise_sample_params.guidance.slg.layer_end},
{"",
"--high-noise-eta",
"(high noise) eta in DDIM, only for DDIM and TCD (default: 0)",
&params.high_noise_sample_params.eta},
{"",
"--strength",
"strength for noising/unnoising (default: 0.75)",
&params.strength},
{"",
"--pm-style-strength",
"",
&params.pm_style_strength},
{"",
"--control-strength",
"strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
&params.control_strength},
{"",
"--moe-boundary",
"timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1",
&params.moe_boundary},
{"",
"--flow-shift",
"shift value for Flow models like SD3.x or WAN (default: auto)",
&params.flow_shift},
{"",
"--vace-strength",
"wan vace strength",
&params.vace_strength},
{"",
"--vae-tile-overlap",
"tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
&params.vae_tiling_params.target_overlap},
}; };
options.bool_options = { options.bool_options = {
{"", {"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled},
"--vae-tiling", {"", "--force-sdxl-vae-conv-scale", "", true, &params.force_sdxl_vae_conv_scale},
"process vae in tiles to reduce memory usage", {"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
true, &params.vae_tiling_params.enabled}, {"", "--control-net-cpu", "", true, &params.control_net_cpu},
{"", {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
"--force-sdxl-vae-conv-scale", {"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
"force use of conv scale on sdxl vae", {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
true, &params.force_sdxl_vae_conv_scale}, {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
{"", {"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
"--offload-to-cpu", {"", "--canny", "", true, &params.canny_preprocess},
"place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", {"-v", "--verbose", "", true, &params.verbose},
true, &params.offload_params_to_cpu}, {"", "--color", "", true, &params.color},
{"", {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
"--control-net-cpu", {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
"keep controlnet in cpu (for low vram)", {"", "--increase-ref-index", "", true, &params.increase_ref_index},
true, &params.control_net_cpu},
{"",
"--clip-on-cpu",
"keep clip in cpu (for low vram)",
true, &params.clip_on_cpu},
{"",
"--vae-on-cpu",
"keep vae in cpu (for low vram)",
true, &params.vae_on_cpu},
{"",
"--diffusion-fa",
"use flash attention in the diffusion model",
true, &params.diffusion_flash_attn},
{"",
"--diffusion-conv-direct",
"use ggml_conv2d_direct in the diffusion model",
true, &params.diffusion_conv_direct},
{"",
"--vae-conv-direct",
"use ggml_conv2d_direct in the vae model",
true, &params.vae_conv_direct},
{"",
"--canny",
"apply canny preprocessor (edge detection)",
true, &params.canny_preprocess},
{"-v",
"--verbose",
"print extra info",
true, &params.verbose},
{"",
"--color",
"colors the logging tags according to level",
true, &params.color},
{"",
"--chroma-disable-dit-mask",
"disable dit mask for chroma",
false, &params.chroma_use_dit_mask},
{"",
"--chroma-enable-t5-mask",
"enable t5 mask for chroma",
true, &params.chroma_use_t5_mask},
{"",
"--increase-ref-index",
"automatically increase the indices of references images based on the order they are listed (starting with 1).",
true, &params.increase_ref_index},
{"",
"--disable-auto-resize-ref-image",
"disable auto resize of ref images",
false, &params.auto_resize_ref_image},
}; };
auto on_mode_arg = [&](int argc, const char** argv, int index) { auto on_mode_arg = [&](int argc, const char** argv, int index) {
@ -933,7 +711,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
}; };
auto on_help_arg = [&](int argc, const char** argv, int index) { auto on_help_arg = [&](int argc, const char** argv, int index) {
print_usage(argc, argv, options); print_usage(argc, argv);
exit(0); exit(0);
return 0; return 0;
}; };
@ -1047,73 +825,25 @@ void parse_args(int argc, const char** argv, SDParams& params) {
}; };
options.manual_options = { options.manual_options = {
{"-M", {"-M", "--mode", "", on_mode_arg},
"--mode", {"", "--type", "", on_type_arg},
"run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen", {"", "--rng", "", on_rng_arg},
on_mode_arg}, {"-s", "--seed", "", on_seed_arg},
{"", {"", "--sampling-method", "", on_sample_method_arg},
"--type", {"", "--prediction", "", on_prediction_arg},
"weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). " {"", "--scheduler", "", on_schedule_arg},
"If not specified, the default is the type of the weight file", {"", "--skip-layers", "", on_skip_layers_arg},
on_type_arg}, {"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg},
{"", {"", "--high-noise-scheduler", "", on_high_noise_schedule_arg},
"--rng", {"", "--high-noise-skip-layers", "", on_high_noise_skip_layers_arg},
"RNG, one of [std_default, cuda], default: cuda", {"-r", "--ref-image", "", on_ref_image_arg},
on_rng_arg}, {"-h", "--help", "", on_help_arg},
{"-s", {"", "--vae-tile-size", "", on_tile_size_arg},
"--seed", {"", "--vae-relative-tile-size", "", on_relative_tile_size_arg},
"RNG seed (default: 42, use random seed for < 0)",
on_seed_arg},
{"",
"--sampling-method",
"sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] "
"(default: euler for Flux/SD3/Wan, euler_a otherwise)",
on_sample_method_arg},
{"",
"--prediction",
"prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]",
on_prediction_arg},
{"",
"--scheduler",
"denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete",
on_schedule_arg},
{"",
"--skip-layers",
"layers to skip for SLG steps (default: [7,8,9])",
on_skip_layers_arg},
{"",
"--high-noise-sampling-method",
"(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]"
" default: euler for Flux/SD3/Wan, euler_a otherwise",
on_high_noise_sample_method_arg},
{"",
"--high-noise-scheduler",
"(high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete",
on_high_noise_schedule_arg},
{"",
"--high-noise-skip-layers",
"(high noise) layers to skip for SLG steps (default: [7,8,9])",
on_high_noise_skip_layers_arg},
{"-r",
"--ref-image",
"reference image for Flux Kontext models (can be used multiple times)",
on_ref_image_arg},
{"-h",
"--help",
"show this help message and exit",
on_help_arg},
{"",
"--vae-tile-size",
"tile size for vae tiling, format [X]x[Y] (default: 32x32)",
on_tile_size_arg},
{"",
"--vae-relative-tile-size",
"relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
on_relative_tile_size_arg},
}; };
if (!parse_options(argc, argv, options)) { if (!parse_options(argc, argv, options)) {
print_usage(argc, argv, options); print_usage(argc, argv);
exit(1); exit(1);
} }
@ -1123,19 +853,19 @@ void parse_args(int argc, const char** argv, SDParams& params) {
if ((params.mode == IMG_GEN || params.mode == VID_GEN) && params.prompt.length() == 0) { if ((params.mode == IMG_GEN || params.mode == VID_GEN) && params.prompt.length() == 0) {
fprintf(stderr, "error: the following arguments are required: prompt\n"); fprintf(stderr, "error: the following arguments are required: prompt\n");
print_usage(argc, argv, options); print_usage(argc, argv);
exit(1); exit(1);
} }
if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) { if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
print_usage(argc, argv, options); print_usage(argc, argv);
exit(1); exit(1);
} }
if (params.output_path.length() == 0) { if (params.output_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: output_path\n"); fprintf(stderr, "error: the following arguments are required: output_path\n");
print_usage(argc, argv, options); print_usage(argc, argv);
exit(1); exit(1);
} }
@ -1698,7 +1428,6 @@ int main(int argc, const char* argv[]) {
init_image, init_image,
ref_images.data(), ref_images.data(),
(int)ref_images.size(), (int)ref_images.size(),
params.auto_resize_ref_image,
params.increase_ref_index, params.increase_ref_index,
mask_image, mask_image,
params.width, params.width,

View File

@ -1970,7 +1970,6 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
"seed: %" PRId64 "seed: %" PRId64
"batch_count: %d\n" "batch_count: %d\n"
"ref_images_count: %d\n" "ref_images_count: %d\n"
"auto_resize_ref_image: %s\n"
"increase_ref_index: %s\n" "increase_ref_index: %s\n"
"control_strength: %.2f\n" "control_strength: %.2f\n"
"photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n" "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
@ -1985,7 +1984,6 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
sd_img_gen_params->seed, sd_img_gen_params->seed,
sd_img_gen_params->batch_count, sd_img_gen_params->batch_count,
sd_img_gen_params->ref_images_count, sd_img_gen_params->ref_images_count,
BOOL_STR(sd_img_gen_params->auto_resize_ref_image),
BOOL_STR(sd_img_gen_params->increase_ref_index), BOOL_STR(sd_img_gen_params->increase_ref_index),
sd_img_gen_params->control_strength, sd_img_gen_params->control_strength,
sd_img_gen_params->pm_params.style_strength, sd_img_gen_params->pm_params.style_strength,
@ -2626,20 +2624,14 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
std::vector<ggml_tensor*> ref_latents; std::vector<ggml_tensor*> ref_latents;
for (int i = 0; i < ref_images.size(); i++) { for (int i = 0; i < ref_images.size(); i++) {
ggml_tensor* img; ggml_tensor* img;
if (sd_img_gen_params->auto_resize_ref_image) { if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
LOG_DEBUG("auto resize ref images");
sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]); sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height); int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height);
double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height); double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
double vae_height = vae_width * ref_image.height / ref_image.width; double vae_height = vae_width * ref_image.height / ref_image.width;
int factor = 16; vae_height = round(vae_height / 32) * 32;
if (sd_version_is_qwen_image(sd_ctx->sd->version)) { vae_width = round(vae_width / 32) * 32;
factor = 32;
}
vae_height = round(vae_height / factor) * factor;
vae_width = round(vae_width / factor) * factor;
sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height)); sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
free(ref_image.data); free(ref_image.data);

View File

@ -216,7 +216,6 @@ typedef struct {
sd_image_t init_image; sd_image_t init_image;
sd_image_t* ref_images; sd_image_t* ref_images;
int ref_images_count; int ref_images_count;
bool auto_resize_ref_image;
bool increase_ref_index; bool increase_ref_index;
sd_image_t mask_image; sd_image_t mask_image;
int width; int width;