Compare commits

...

2 Commits

4 changed files with 596 additions and 319 deletions

View File

@ -1,113 +1,110 @@
# Run # Run
``` ```
usage: ./bin/sd [arguments] usage: ./bin/sd [options]
arguments: Options:
-h, --help show this help message and exit -m, --model <string> path to full model
-M, --mode [MODE] run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen --clip_l <string> path to the clip-l text encoder
-t, --threads N number of threads to use during computation (default: -1) --clip_g <string> path to the clip-g text encoder
If threads <= 0, then threads will be set to the number of CPU physical cores --clip_vision <string> path to the clip-vision encoder
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --t5xxl <string> path to the t5xxl text encoder
-m, --model [MODEL] path to full model --qwen2vl <string> path to the qwen2vl text encoder
--diffusion-model path to the standalone diffusion model --qwen2vl_vision <string> path to the qwen2vl vit
--high-noise-diffusion-model path to the standalone high noise diffusion model --diffusion-model <string> path to the standalone diffusion model
--clip_l path to the clip-l text encoder --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--clip_g path to the clip-g text encoder --vae <string> path to standalone vae model
--clip_vision path to the clip-vision encoder --taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--t5xxl path to the t5xxl text encoder --control-net <string> path to control net model
--qwen2vl path to the qwen2vl text encoder --embd-dir <string> embeddings directory
--qwen2vl_vision path to the qwen2vl vit --lora-model-dir <string> lora model directory
--vae [VAE] path to vae -i, --init-img <string> path to the init image
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --end-img <string> path to the end image, required by flf2v
--control-net [CONTROL_PATH] path to control net model --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--embd-dir [EMBEDDING_PATH] path to embeddings --photo-maker <string> path to PHOTOMAKER model
--upscale-model [ESRGAN_PATH] path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--upscale-repeats Run the ESRGAN upscaler this many times (default 1) --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K) --mask <string> path to the mask image
If not specified, the default is the type of the weight file --control-image <string> path to control image, control net
--tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in
--lora-model-dir [DIR] lora model directory lexicographical (character) order. For example, if the control video path is
-i, --init-img [IMAGE] path to the init image, required by img2img `frames`, the directory contains images such as 00.png, 01.png, ... etc.
--mask [MASK] path to the mask image, required by img2img with mask -o, --output <string> path to write result image to (default: ./output.png)
-i, --end-img [IMAGE] path to the end image, required by flf2v -p, --prompt <string> the prompt to render
--control-image [IMAGE] path to image condition, control net -n, --negative-prompt <string> the negative prompt (default: "")
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) --upscale-model <string> path to esrgan model.
--control-video [PATH] path to control video frames, It must be a directory path. -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
The video frames inside should be stored as images in lexicographical (character) order CPU physical cores
For example, if the control video path is `frames`, the directory contains images such as 00.png, 01.png, … etc. --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). -H, --height <int> image height, in pixel space (default: 512)
-o, --output OUTPUT path to write result image to (default: ./output.png) -W, --width <int> image width, in pixel space (default: 512)
-p, --prompt [PROMPT] the prompt to render --steps <int> number of sample steps (default: 20)
-n, --negative-prompt PROMPT the negative prompt (default: "") --high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--cfg-scale SCALE unconditional guidance scale: (default: 7.0) --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
--img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) will be 1 for SD1.x, 2 for SD2.x
--guidance SCALE distilled guidance scale for models with guidance input (default: 3.5) -b, --batch-count <int> batch count
--slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0) --chroma-t5-mask-pad <int> t5 mask pad size of chroma
0 means disabled, a value of 2.5 is nice for sd3.5 medium --video-frames <int> video frames (default: 1)
--eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0) --fps <int> fps (default: 24)
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9]) --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
--skip-layer-start START SLG enabling point: (default: 0.01) NitroSD-Vibrant
--skip-layer-end END SLG disabling point: (default: 0.2) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete) --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd} --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise) --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
--timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant medium
--steps STEPS number of sample steps (default: 20) --skip-layer-start <float> SLG enabling point (default: 0.01)
--high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0) --skip-layer-end <float> SLG disabling point (default: 0.2)
--high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
--high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
0 means disabled, a value of 2.5 is nice for sd3.5 medium --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0) --high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9]) --high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01) --high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2) --high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
--high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete) --strength <float> strength for noising/unnoising (default: 0.75)
--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd} --pm-style-strength <float>
(high noise) sampling method (default: "euler_a") --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
--high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto) --moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END]) --flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--strength STRENGTH strength for noising/unnoising (default: 0.75) --vace-strength <float> wan vace strength
--control-strength STRENGTH strength to apply Control Net (default: 0.9) --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
1.0 corresponds to full destruction of information in init image --vae-tiling process vae in tiles to reduce memory usage
-H, --height H image height, in pixel space (default: 512) --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
-W, --width W image width, in pixel space (default: 512) --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--rng {std_default, cuda} RNG (default: cuda) --control-net-cpu keep controlnet in cpu (for low vram)
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0) --clip-on-cpu keep clip in cpu (for low vram)
-b, --batch-count COUNT number of images to generate --vae-on-cpu keep vae in cpu (for low vram)
--prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override --diffusion-fa use flash attention in the diffusion model
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1) --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x --vae-conv-direct use ggml_conv2d_direct in the vae model
--vae-tiling process vae in tiles to reduce memory usage --canny apply canny preprocessor (edge detection)
--vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32) -v, --verbose print extra info
--vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size) --color colors the logging tags according to level
--vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5) --chroma-disable-dit-mask disable dit mask for chroma
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --chroma-enable-t5-mask enable t5 mask for chroma
--vae-on-cpu keep vae in cpu (for low vram) --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
--clip-on-cpu keep clip in cpu (for low vram) --disable-auto-resize-ref-image disable auto resize of ref images
--diffusion-fa use flash attention in the diffusion model (for low vram) -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
Might lower quality, since it implies converting k and v to f16. --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
This might crash if it is not supported by the backend. type of the weight file
--diffusion-conv-direct use Conv2d direct in the diffusion model --rng RNG, one of [std_default, cuda], default: cuda
This might crash if it is not supported by the backend. -s, --seed RNG seed (default: 42, use random seed for < 0)
--vae-conv-direct use Conv2d direct in the vae model (should improve the performance) --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
This might crash if it is not supported by the backend. tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--control-net-cpu keep controlnet in cpu (for low vram) --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]
--canny apply canny preprocessor (edge detection) --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default:
--color colors the logging tags according to level discrete
--chroma-disable-dit-mask disable dit mask for chroma --skip-layers layers to skip for SLG steps (default: [7,8,9])
--chroma-enable-t5-mask enable t5 mask for chroma --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
--chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
--video-frames video frames (default: 1) --high-noise-scheduler (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform,
--fps fps (default: 24) simple], default: discrete
--moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
only enabled if `--high-noise-steps` is set to -1 -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto) -h, --help show this help message and exit
--vace-strength wan vace strength --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--photo-maker path to PHOTOMAKER model --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
--pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir (overrides --vae-tile-size)
--pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed
--pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20)
-v, --verbose print extra info
``` ```

View File

@ -7,6 +7,7 @@
#include <map> #include <map>
#include <random> #include <random>
#include <regex> #include <regex>
#include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
@ -80,7 +81,8 @@ struct SDParams {
std::string control_image_path; std::string control_image_path;
std::vector<std::string> ref_image_paths; std::vector<std::string> ref_image_paths;
std::string control_video_path; std::string control_video_path;
bool increase_ref_index = false; bool auto_resize_ref_image = true;
bool increase_ref_index = false;
std::string prompt; std::string prompt;
std::string negative_prompt; std::string negative_prompt;
@ -175,6 +177,7 @@ void print_params(SDParams params) {
printf(" %s\n", path.c_str()); printf(" %s\n", path.c_str());
}; };
printf(" control_video_path: %s\n", params.control_video_path.c_str()); printf(" control_video_path: %s\n", params.control_video_path.c_str());
printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false");
printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false");
printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
@ -211,118 +214,6 @@ void print_params(SDParams params) {
free(high_noise_sample_params_str); free(high_noise_sample_params_str);
} }
// Prints the full command-line help text to stdout.
// Only argv[0] (the program name) is read; argc is part of the signature for
// symmetry with main() and is otherwise unused.
// Fix: "-i" was previously listed for both --init-img and --end-img; the
// duplicate short flag has been removed from the --end-img entry.
void print_usage(int argc, const char* argv[]) {
    printf("usage: %s [arguments]\n", argv[0]);
    printf("\n");
    printf("arguments:\n");
    printf("  -h, --help                         show this help message and exit\n");
    printf("  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen\n");
    printf("  -t, --threads N                    number of threads to use during computation (default: -1)\n");
    printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
    printf("  --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed\n");
    printf("  -m, --model [MODEL]                path to full model\n");
    printf("  --diffusion-model                  path to the standalone diffusion model\n");
    printf("  --high-noise-diffusion-model       path to the standalone high noise diffusion model\n");
    printf("  --clip_l                           path to the clip-l text encoder\n");
    printf("  --clip_g                           path to the clip-g text encoder\n");
    printf("  --clip_vision                      path to the clip-vision encoder\n");
    printf("  --t5xxl                            path to the t5xxl text encoder\n");
    printf("  --qwen2vl                          path to the qwen2vl text encoder\n");
    printf("  --qwen2vl_vision                   path to the qwen2vl vit\n");
    printf("  --vae [VAE]                        path to vae\n");
    printf("  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
    printf("  --control-net [CONTROL_PATH]       path to control net model\n");
    printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings\n");
    printf("  --upscale-model [ESRGAN_PATH]      path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
    printf("  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)\n");
    printf("  --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n");
    printf("                                     If not specified, the default is the type of the weight file\n");
    printf("  --tensor-type-rules [EXPRESSION]   weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n");
    printf("  --lora-model-dir [DIR]             lora model directory\n");
    printf("  -i, --init-img [IMAGE]             path to the init image, required by img2img\n");
    printf("  --mask [MASK]                      path to the mask image, required by img2img with mask\n");
    printf("  --end-img [IMAGE]                  path to the end image, required by flf2v\n");
    printf("  --control-image [IMAGE]            path to image condition, control net\n");
    printf("  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times) \n");
    printf("  --control-video [PATH]             path to control video frames, It must be a directory path.\n");
    printf("                                     The video frames inside should be stored as images in lexicographical (character) order\n");
    printf("                                     For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.\n");
    printf("  --increase-ref-index               automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
    printf("  -o, --output OUTPUT                path to write result image to (default: ./output.png)\n");
    printf("  -p, --prompt [PROMPT]              the prompt to render\n");
    printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
    printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
    printf("  --img-cfg-scale SCALE              image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
    printf("  --guidance SCALE                   distilled guidance scale for models with guidance input (default: 3.5)\n");
    printf("  --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
    printf("                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
    printf("  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)\n");
    printf("  --skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])\n");
    printf("  --skip-layer-start START           SLG enabling point: (default: 0.01)\n");
    printf("  --skip-layer-end END               SLG disabling point: (default: 0.2)\n");
    printf("  --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n");
    printf("  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
    printf("                                     sampling method (default: \"euler\" for Flux/SD3/Wan, \"euler_a\" otherwise)\n");
    printf("  --timestep-shift N                 shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant\n");
    printf("  --steps STEPS                      number of sample steps (default: 20)\n");
    printf("  --high-noise-cfg-scale SCALE       (high noise) unconditional guidance scale: (default: 7.0)\n");
    printf("  --high-noise-img-cfg-scale SCALE   (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
    printf("  --high-noise-guidance SCALE        (high noise) distilled guidance scale for models with guidance input (default: 3.5)\n");
    printf("  --high-noise-slg-scale SCALE       (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
    printf("                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
    printf("  --high-noise-eta SCALE             (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)\n");
    printf("  --high-noise-skip-layers LAYERS    (high noise) Layers to skip for SLG steps: (default: [7,8,9])\n");
    printf("  --high-noise-skip-layer-start      (high noise) SLG enabling point: (default: 0.01)\n");
    printf("  --high-noise-skip-layer-end END    (high noise) SLG disabling point: (default: 0.2)\n");
    printf("  --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n");
    printf("  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
    printf("                                     (high noise) sampling method (default: \"euler_a\")\n");
    printf("  --high-noise-steps STEPS           (high noise) number of sample steps (default: -1 = auto)\n");
    printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
    printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
    printf("  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)\n");
    printf("                                     1.0 corresponds to full destruction of information in init image\n");
    printf("  -H, --height H                     image height, in pixel space (default: 512)\n");
    printf("  -W, --width W                      image width, in pixel space (default: 512)\n");
    printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
    printf("  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)\n");
    printf("  -b, --batch-count COUNT            number of images to generate\n");
    printf("  --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override.\n");
    printf("  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
    printf("                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
    printf("  --vae-tiling                       process vae in tiles to reduce memory usage\n");
    printf("  --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)\n");
    printf("  --vae-relative-tile-size [X]x[Y]   relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n");
    printf("  --vae-tile-overlap OVERLAP         tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n");
    printf("  --force-sdxl-vae-conv-scale        force use of conv scale on sdxl vae\n");
    printf("  --vae-on-cpu                       keep vae in cpu (for low vram)\n");
    printf("  --clip-on-cpu                      keep clip in cpu (for low vram)\n");
    printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
    printf("                                     Might lower quality, since it implies converting k and v to f16.\n");
    printf("                                     This might crash if it is not supported by the backend.\n");
    printf("  --diffusion-conv-direct            use Conv2d direct in the diffusion model\n");
    printf("                                     This might crash if it is not supported by the backend.\n");
    printf("  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)\n");
    printf("                                     This might crash if it is not supported by the backend.\n");
    printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
    printf("  --canny                            apply canny preprocessor (edge detection)\n");
    printf("  --color                            colors the logging tags according to level\n");
    printf("  --chroma-disable-dit-mask          disable dit mask for chroma\n");
    printf("  --chroma-enable-t5-mask            enable t5 mask for chroma\n");
    printf("  --chroma-t5-mask-pad PAD_SIZE      t5 mask pad size of chroma\n");
    printf("  --video-frames                     video frames (default: 1)\n");
    printf("  --fps                              fps (default: 24)\n");
    printf("  --moe-boundary BOUNDARY            timestep boundary for Wan2.2 MoE model. (default: 0.875)\n");
    printf("                                     only enabled if `--high-noise-steps` is set to -1\n");
    printf("  --flow-shift SHIFT                 shift value for Flow models like SD3.x or WAN (default: auto)\n");
    printf("  --vace-strength                    wan vace strength\n");
    printf("  --photo-maker                      path to PHOTOMAKER model\n");
    printf("  --pm-id-images-dir [DIR]           path to PHOTOMAKER input id images dir\n");
    printf("  --pm-id-embed-path [PATH]          path to PHOTOMAKER v2 id embed\n");
    printf("  --pm-style-strength                strength for keeping PHOTOMAKER input identity (default: 20)\n");
    printf("  -v, --verbose                      print extra info\n");
}
#if defined(_WIN32) #if defined(_WIN32)
static std::string utf16_to_utf8(const std::wstring& wstr) { static std::string utf16_to_utf8(const std::wstring& wstr) {
if (wstr.empty()) if (wstr.empty())
@ -492,93 +383,424 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) {
return true; return true;
} }
// Word-wraps `text` so that no produced line exceeds `width` columns,
// assuming every line (including the first) starts at column `indent`
// (the caller prints the option-name column before the first line).
// Manual '\n' characters in `text` are preserved; continuation lines are
// padded with `indent` spaces. Words longer than the available width are
// emitted unbroken rather than split mid-word.
//
// Fixes over the previous version:
//  - `line_len` now accounts for the word carried onto a new line, so
//    wrapped lines no longer overflow `width`.
//  - no longer rebuilds the whole accumulated string on every wrap
//    (was accidentally quadratic via oss.str()/substr).
static std::string wrap_text(const std::string& text, size_t width, size_t indent) {
    const std::string pad(indent, ' ');
    std::ostringstream out;
    size_t line_len = indent;  // current line starts after the name column
    size_t pos      = 0;
    while (pos < text.size()) {
        if (text[pos] == '\n') {
            // Preserve manual newlines, indenting the continuation line.
            out << '\n'
                << pad;
            line_len = indent;
            ++pos;
            continue;
        }
        if (text[pos] == ' ') {
            // Separator; spacing is re-inserted between words below.
            ++pos;
            continue;
        }
        // Extract the next word.
        size_t end = pos;
        while (end < text.size() && text[end] != ' ' && text[end] != '\n')
            ++end;
        const size_t word_len = end - pos;
        if (line_len > indent) {
            // Not the first word on this line: wrap if it would overflow.
            if (line_len + 1 + word_len > width) {
                out << '\n'
                    << pad;
                line_len = indent;
            } else {
                out << ' ';
                ++line_len;
            }
        }
        out << text.substr(pos, word_len);
        line_len += word_len;
        pos = end;
    }
    return out.str();
}
// Prints a generated help message: a usage line followed by one aligned row
// per registered option, with descriptions wrapped at `max_line_width`.
// Rows are emitted grouped by option kind (string, int, float, bool, manual),
// in registration order within each group.
// Changes: removed the dead `desc_width` local (computed but never used),
// hoisted the loop-invariant `indent` out of the print loop, and explicitly
// marked `argc` as unused.
void print_usage(int argc, const char* argv[], const ArgOptions& options) {
    (void)argc;  // signature kept for callers; only argv[0] is read
    constexpr size_t max_line_width = 120;

    std::cout << "Usage: " << argv[0] << " [options]\n\n";
    std::cout << "Options:\n";

    // One row of the help table: "short, long <hint>" plus its description.
    struct Entry {
        std::string names;
        std::string desc;
    };
    std::vector<Entry> entries;

    // Builds the name column for a single option; `hint` is the value
    // placeholder ("<string>", "<int>", ...) or empty for flags.
    auto add_entry = [&](const std::string& s, const std::string& l,
                         const std::string& desc, const std::string& hint = "") {
        std::ostringstream ss;
        if (!s.empty())
            ss << s;
        if (!s.empty() && !l.empty())
            ss << ", ";
        if (!l.empty())
            ss << l;
        if (!hint.empty())
            ss << " " << hint;
        entries.push_back({ss.str(), desc});
    };

    for (auto& o : options.string_options)
        add_entry(o.short_name, o.long_name, o.desc, "<string>");
    for (auto& o : options.int_options)
        add_entry(o.short_name, o.long_name, o.desc, "<int>");
    for (auto& o : options.float_options)
        add_entry(o.short_name, o.long_name, o.desc, "<float>");
    for (auto& o : options.bool_options)
        add_entry(o.short_name, o.long_name, o.desc, "");
    for (auto& o : options.manual_options)
        add_entry(o.short_name, o.long_name, o.desc);

    // Width of the widest name column, used to align all descriptions.
    size_t max_name_width = 0;
    for (const auto& e : entries)
        max_name_width = std::max(max_name_width, e.names.size());

    // Descriptions start after: 2-space margin + name column + 4-space gap.
    const size_t indent = 2 + max_name_width + 4;
    for (const auto& e : entries) {
        std::string wrapped_desc = wrap_text(e.desc, max_line_width, indent);
        std::cout << "  " << std::left << std::setw(static_cast<int>(max_name_width) + 4)
                  << e.names << wrapped_desc << "\n";
    }
}
void parse_args(int argc, const char** argv, SDParams& params) { void parse_args(int argc, const char** argv, SDParams& params) {
ArgOptions options; ArgOptions options;
options.string_options = { options.string_options = {
{"-m", "--model", "", &params.model_path}, {"-m",
{"", "--clip_l", "", &params.clip_l_path}, "--model",
{"", "--clip_g", "", &params.clip_g_path}, "path to full model",
{"", "--clip_vision", "", &params.clip_vision_path}, &params.model_path},
{"", "--t5xxl", "", &params.t5xxl_path}, {"",
{"", "--qwen2vl", "", &params.qwen2vl_path}, "--clip_l",
{"", "--qwen2vl_vision", "", &params.qwen2vl_vision_path}, "path to the clip-l text encoder", &params.clip_l_path},
{"", "--diffusion-model", "", &params.diffusion_model_path}, {"", "--clip_g",
{"", "--high-noise-diffusion-model", "", &params.high_noise_diffusion_model_path}, "path to the clip-g text encoder",
{"", "--vae", "", &params.vae_path}, &params.clip_g_path},
{"", "--taesd", "", &params.taesd_path}, {"",
{"", "--control-net", "", &params.control_net_path}, "--clip_vision",
{"", "--embd-dir", "", &params.embedding_dir}, "path to the clip-vision encoder",
{"", "--lora-model-dir", "", &params.lora_model_dir}, &params.clip_vision_path},
{"-i", "--init-img", "", &params.init_image_path}, {"",
{"", "--end-img", "", &params.end_image_path}, "--t5xxl",
{"", "--tensor-type-rules", "", &params.tensor_type_rules}, "path to the t5xxl text encoder",
{"", "--photo-maker", "", &params.photo_maker_path}, &params.t5xxl_path},
{"", "--pm-id-images-dir", "", &params.pm_id_images_dir}, {"",
{"", "--pm-id-embed-path", "", &params.pm_id_embed_path}, "--qwen2vl",
{"", "--mask", "", &params.mask_image_path}, "path to the qwen2vl text encoder",
{"", "--control-image", "", &params.control_image_path}, &params.qwen2vl_path},
{"", "--control-video", "", &params.control_video_path}, {"",
{"-o", "--output", "", &params.output_path}, "--qwen2vl_vision",
{"-p", "--prompt", "", &params.prompt}, "path to the qwen2vl vit",
{"-n", "--negative-prompt", "", &params.negative_prompt}, &params.qwen2vl_vision_path},
{"", "--upscale-model", "", &params.esrgan_path}, {"",
"--diffusion-model",
"path to the standalone diffusion model",
&params.diffusion_model_path},
{"",
"--high-noise-diffusion-model",
"path to the standalone high noise diffusion model",
&params.high_noise_diffusion_model_path},
{"",
"--vae",
"path to standalone vae model",
&params.vae_path},
{"",
"--taesd",
"path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
&params.taesd_path},
{"",
"--control-net",
"path to control net model",
&params.control_net_path},
{"",
"--embd-dir",
"embeddings directory",
&params.embedding_dir},
{"",
"--lora-model-dir",
"lora model directory",
&params.lora_model_dir},
{"-i",
"--init-img",
"path to the init image",
&params.init_image_path},
{"",
"--end-img",
"path to the end image, required by flf2v",
&params.end_image_path},
{"",
"--tensor-type-rules",
"weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
&params.tensor_type_rules},
{"",
"--photo-maker",
"path to PHOTOMAKER model",
&params.photo_maker_path},
{"",
"--pm-id-images-dir",
"path to PHOTOMAKER input id images dir",
&params.pm_id_images_dir},
{"",
"--pm-id-embed-path",
"path to PHOTOMAKER v2 id embed",
&params.pm_id_embed_path},
{"",
"--mask",
"path to the mask image",
&params.mask_image_path},
{"",
"--control-image",
"path to control image, control net",
&params.control_image_path},
{"",
"--control-video",
"path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
"lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
"such as 00.png, 01.png, ... etc.",
&params.control_video_path},
{"-o",
"--output",
"path to write result image to (default: ./output.png)",
&params.output_path},
{"-p",
"--prompt",
"the prompt to render",
&params.prompt},
{"-n",
"--negative-prompt",
"the negative prompt (default: \"\")",
&params.negative_prompt},
{"",
"--upscale-model",
"path to esrgan model.",
&params.esrgan_path},
}; };
options.int_options = { options.int_options = {
{"-t", "--threads", "", &params.n_threads}, {"-t",
{"", "--upscale-repeats", "", &params.upscale_repeats}, "--threads",
{"-H", "--height", "", &params.height}, "number of threads to use during computation (default: -1). "
{"-W", "--width", "", &params.width}, "If threads <= 0, then threads will be set to the number of CPU physical cores",
{"", "--steps", "", &params.sample_params.sample_steps}, &params.n_threads},
{"", "--high-noise-steps", "", &params.high_noise_sample_params.sample_steps}, {"",
{"", "--clip-skip", "", &params.clip_skip}, "--upscale-repeats",
{"-b", "--batch-count", "", &params.batch_count}, "Run the ESRGAN upscaler this many times (default: 1)",
{"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad}, &params.upscale_repeats},
{"", "--video-frames", "", &params.video_frames}, {"-H",
{"", "--fps", "", &params.fps}, "--height",
{"", "--timestep-shift", "", &params.sample_params.shifted_timestep}, "image height, in pixel space (default: 512)",
&params.height},
{"-W",
"--width",
"image width, in pixel space (default: 512)",
&params.width},
{"",
"--steps",
"number of sample steps (default: 20)",
&params.sample_params.sample_steps},
{"",
"--high-noise-steps",
"(high noise) number of sample steps (default: -1 = auto)",
&params.high_noise_sample_params.sample_steps},
{"",
"--clip-skip",
"ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). "
"<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x",
&params.clip_skip},
{"-b",
"--batch-count",
"batch count",
&params.batch_count},
{"",
"--chroma-t5-mask-pad",
"t5 mask pad size of chroma",
&params.chroma_t5_mask_pad},
{"",
"--video-frames",
"video frames (default: 1)",
&params.video_frames},
{"",
"--fps",
"fps (default: 24)",
&params.fps},
{"",
"--timestep-shift",
"shift timestep for NitroFusion models (default: 0). "
"recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant",
&params.sample_params.shifted_timestep},
}; };
options.float_options = { options.float_options = {
{"", "--cfg-scale", "", &params.sample_params.guidance.txt_cfg}, {"",
{"", "--img-cfg-scale", "", &params.sample_params.guidance.img_cfg}, "--cfg-scale",
{"", "--guidance", "", &params.sample_params.guidance.distilled_guidance}, "unconditional guidance scale: (default: 7.0)",
{"", "--slg-scale", "", &params.sample_params.guidance.slg.scale}, &params.sample_params.guidance.txt_cfg},
{"", "--skip-layer-start", "", &params.sample_params.guidance.slg.layer_start}, {"",
{"", "--skip-layer-end", "", &params.sample_params.guidance.slg.layer_end}, "--img-cfg-scale",
{"", "--eta", "", &params.sample_params.eta}, "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)",
{"", "--high-noise-cfg-scale", "", &params.high_noise_sample_params.guidance.txt_cfg}, &params.sample_params.guidance.img_cfg},
{"", "--high-noise-img-cfg-scale", "", &params.high_noise_sample_params.guidance.img_cfg}, {"",
{"", "--high-noise-guidance", "", &params.high_noise_sample_params.guidance.distilled_guidance}, "--guidance",
{"", "--high-noise-slg-scale", "", &params.high_noise_sample_params.guidance.slg.scale}, "distilled guidance scale for models with guidance input (default: 3.5)",
{"", "--high-noise-skip-layer-start", "", &params.high_noise_sample_params.guidance.slg.layer_start}, &params.sample_params.guidance.distilled_guidance},
{"", "--high-noise-skip-layer-end", "", &params.high_noise_sample_params.guidance.slg.layer_end}, {"",
{"", "--high-noise-eta", "", &params.high_noise_sample_params.eta}, "--slg-scale",
{"", "--strength", "", &params.strength}, "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium",
{"", "--pm-style-strength", "", &params.pm_style_strength}, &params.sample_params.guidance.slg.scale},
{"", "--control-strength", "", &params.control_strength}, {"",
{"", "--moe-boundary", "", &params.moe_boundary}, "--skip-layer-start",
{"", "--flow-shift", "", &params.flow_shift}, "SLG enabling point (default: 0.01)",
{"", "--vace-strength", "", &params.vace_strength}, &params.sample_params.guidance.slg.layer_start},
{"", "--vae-tile-overlap", "", &params.vae_tiling_params.target_overlap}, {"",
"--skip-layer-end",
"SLG disabling point (default: 0.2)",
&params.sample_params.guidance.slg.layer_end},
{"",
"--eta",
"eta in DDIM, only for DDIM and TCD (default: 0)",
&params.sample_params.eta},
{"",
"--high-noise-cfg-scale",
"(high noise) unconditional guidance scale: (default: 7.0)",
&params.high_noise_sample_params.guidance.txt_cfg},
{"",
"--high-noise-img-cfg-scale",
"(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)",
&params.high_noise_sample_params.guidance.img_cfg},
{"",
"--high-noise-guidance",
"(high noise) distilled guidance scale for models with guidance input (default: 3.5)",
&params.high_noise_sample_params.guidance.distilled_guidance},
{"",
"--high-noise-slg-scale",
"(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)",
&params.high_noise_sample_params.guidance.slg.scale},
{"",
"--high-noise-skip-layer-start",
"(high noise) SLG enabling point (default: 0.01)",
&params.high_noise_sample_params.guidance.slg.layer_start},
{"",
"--high-noise-skip-layer-end",
"(high noise) SLG disabling point (default: 0.2)",
&params.high_noise_sample_params.guidance.slg.layer_end},
{"",
"--high-noise-eta",
"(high noise) eta in DDIM, only for DDIM and TCD (default: 0)",
&params.high_noise_sample_params.eta},
{"",
"--strength",
"strength for noising/unnoising (default: 0.75)",
&params.strength},
{"",
"--pm-style-strength",
"",
&params.pm_style_strength},
{"",
"--control-strength",
"strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
&params.control_strength},
{"",
"--moe-boundary",
"timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1",
&params.moe_boundary},
{"",
"--flow-shift",
"shift value for Flow models like SD3.x or WAN (default: auto)",
&params.flow_shift},
{"",
"--vace-strength",
"wan vace strength",
&params.vace_strength},
{"",
"--vae-tile-overlap",
"tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
&params.vae_tiling_params.target_overlap},
}; };
options.bool_options = { options.bool_options = {
{"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled}, {"",
{"", "--force-sdxl-vae-conv-scale", "", true, &params.force_sdxl_vae_conv_scale}, "--vae-tiling",
{"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu}, "process vae in tiles to reduce memory usage",
{"", "--control-net-cpu", "", true, &params.control_net_cpu}, true, &params.vae_tiling_params.enabled},
{"", "--clip-on-cpu", "", true, &params.clip_on_cpu}, {"",
{"", "--vae-on-cpu", "", true, &params.vae_on_cpu}, "--force-sdxl-vae-conv-scale",
{"", "--diffusion-fa", "", true, &params.diffusion_flash_attn}, "force use of conv scale on sdxl vae",
{"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct}, true, &params.force_sdxl_vae_conv_scale},
{"", "--vae-conv-direct", "", true, &params.vae_conv_direct}, {"",
{"", "--canny", "", true, &params.canny_preprocess}, "--offload-to-cpu",
{"-v", "--verbose", "", true, &params.verbose}, "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed",
{"", "--color", "", true, &params.color}, true, &params.offload_params_to_cpu},
{"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask}, {"",
{"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask}, "--control-net-cpu",
{"", "--increase-ref-index", "", true, &params.increase_ref_index}, "keep controlnet in cpu (for low vram)",
true, &params.control_net_cpu},
{"",
"--clip-on-cpu",
"keep clip in cpu (for low vram)",
true, &params.clip_on_cpu},
{"",
"--vae-on-cpu",
"keep vae in cpu (for low vram)",
true, &params.vae_on_cpu},
{"",
"--diffusion-fa",
"use flash attention in the diffusion model",
true, &params.diffusion_flash_attn},
{"",
"--diffusion-conv-direct",
"use ggml_conv2d_direct in the diffusion model",
true, &params.diffusion_conv_direct},
{"",
"--vae-conv-direct",
"use ggml_conv2d_direct in the vae model",
true, &params.vae_conv_direct},
{"",
"--canny",
"apply canny preprocessor (edge detection)",
true, &params.canny_preprocess},
{"-v",
"--verbose",
"print extra info",
true, &params.verbose},
{"",
"--color",
"colors the logging tags according to level",
true, &params.color},
{"",
"--chroma-disable-dit-mask",
"disable dit mask for chroma",
false, &params.chroma_use_dit_mask},
{"",
"--chroma-enable-t5-mask",
"enable t5 mask for chroma",
true, &params.chroma_use_t5_mask},
{"",
"--increase-ref-index",
"automatically increase the indices of references images based on the order they are listed (starting with 1).",
true, &params.increase_ref_index},
{"",
"--disable-auto-resize-ref-image",
"disable auto resize of ref images",
false, &params.auto_resize_ref_image},
}; };
auto on_mode_arg = [&](int argc, const char** argv, int index) { auto on_mode_arg = [&](int argc, const char** argv, int index) {
@ -711,7 +933,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
}; };
auto on_help_arg = [&](int argc, const char** argv, int index) { auto on_help_arg = [&](int argc, const char** argv, int index) {
print_usage(argc, argv); print_usage(argc, argv, options);
exit(0); exit(0);
return 0; return 0;
}; };
@ -825,25 +1047,73 @@ void parse_args(int argc, const char** argv, SDParams& params) {
}; };
options.manual_options = { options.manual_options = {
{"-M", "--mode", "", on_mode_arg}, {"-M",
{"", "--type", "", on_type_arg}, "--mode",
{"", "--rng", "", on_rng_arg}, "run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen",
{"-s", "--seed", "", on_seed_arg}, on_mode_arg},
{"", "--sampling-method", "", on_sample_method_arg}, {"",
{"", "--prediction", "", on_prediction_arg}, "--type",
{"", "--scheduler", "", on_schedule_arg}, "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). "
{"", "--skip-layers", "", on_skip_layers_arg}, "If not specified, the default is the type of the weight file",
{"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg}, on_type_arg},
{"", "--high-noise-scheduler", "", on_high_noise_schedule_arg}, {"",
{"", "--high-noise-skip-layers", "", on_high_noise_skip_layers_arg}, "--rng",
{"-r", "--ref-image", "", on_ref_image_arg}, "RNG, one of [std_default, cuda], default: cuda",
{"-h", "--help", "", on_help_arg}, on_rng_arg},
{"", "--vae-tile-size", "", on_tile_size_arg}, {"-s",
{"", "--vae-relative-tile-size", "", on_relative_tile_size_arg}, "--seed",
"RNG seed (default: 42, use random seed for < 0)",
on_seed_arg},
{"",
"--sampling-method",
"sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] "
"(default: euler for Flux/SD3/Wan, euler_a otherwise)",
on_sample_method_arg},
{"",
"--prediction",
"prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]",
on_prediction_arg},
{"",
"--scheduler",
"denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete",
on_schedule_arg},
{"",
"--skip-layers",
"layers to skip for SLG steps (default: [7,8,9])",
on_skip_layers_arg},
{"",
"--high-noise-sampling-method",
"(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]"
" default: euler for Flux/SD3/Wan, euler_a otherwise",
on_high_noise_sample_method_arg},
{"",
"--high-noise-scheduler",
"(high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete",
on_high_noise_schedule_arg},
{"",
"--high-noise-skip-layers",
"(high noise) layers to skip for SLG steps (default: [7,8,9])",
on_high_noise_skip_layers_arg},
{"-r",
"--ref-image",
"reference image for Flux Kontext models (can be used multiple times)",
on_ref_image_arg},
{"-h",
"--help",
"show this help message and exit",
on_help_arg},
{"",
"--vae-tile-size",
"tile size for vae tiling, format [X]x[Y] (default: 32x32)",
on_tile_size_arg},
{"",
"--vae-relative-tile-size",
"relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
on_relative_tile_size_arg},
}; };
if (!parse_options(argc, argv, options)) { if (!parse_options(argc, argv, options)) {
print_usage(argc, argv); print_usage(argc, argv, options);
exit(1); exit(1);
} }
@ -853,19 +1123,19 @@ void parse_args(int argc, const char** argv, SDParams& params) {
if ((params.mode == IMG_GEN || params.mode == VID_GEN) && params.prompt.length() == 0) { if ((params.mode == IMG_GEN || params.mode == VID_GEN) && params.prompt.length() == 0) {
fprintf(stderr, "error: the following arguments are required: prompt\n"); fprintf(stderr, "error: the following arguments are required: prompt\n");
print_usage(argc, argv); print_usage(argc, argv, options);
exit(1); exit(1);
} }
if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) { if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
print_usage(argc, argv); print_usage(argc, argv, options);
exit(1); exit(1);
} }
if (params.output_path.length() == 0) { if (params.output_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: output_path\n"); fprintf(stderr, "error: the following arguments are required: output_path\n");
print_usage(argc, argv); print_usage(argc, argv, options);
exit(1); exit(1);
} }
@ -1428,6 +1698,7 @@ int main(int argc, const char* argv[]) {
init_image, init_image,
ref_images.data(), ref_images.data(),
(int)ref_images.size(), (int)ref_images.size(),
params.auto_resize_ref_image,
params.increase_ref_index, params.increase_ref_index,
mask_image, mask_image,
params.width, params.width,

View File

@ -1970,6 +1970,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
"seed: %" PRId64 "seed: %" PRId64
"batch_count: %d\n" "batch_count: %d\n"
"ref_images_count: %d\n" "ref_images_count: %d\n"
"auto_resize_ref_image: %s\n"
"increase_ref_index: %s\n" "increase_ref_index: %s\n"
"control_strength: %.2f\n" "control_strength: %.2f\n"
"photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n" "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
@ -1984,6 +1985,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
sd_img_gen_params->seed, sd_img_gen_params->seed,
sd_img_gen_params->batch_count, sd_img_gen_params->batch_count,
sd_img_gen_params->ref_images_count, sd_img_gen_params->ref_images_count,
BOOL_STR(sd_img_gen_params->auto_resize_ref_image),
BOOL_STR(sd_img_gen_params->increase_ref_index), BOOL_STR(sd_img_gen_params->increase_ref_index),
sd_img_gen_params->control_strength, sd_img_gen_params->control_strength,
sd_img_gen_params->pm_params.style_strength, sd_img_gen_params->pm_params.style_strength,
@ -2624,14 +2626,20 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
std::vector<ggml_tensor*> ref_latents; std::vector<ggml_tensor*> ref_latents;
for (int i = 0; i < ref_images.size(); i++) { for (int i = 0; i < ref_images.size(); i++) {
ggml_tensor* img; ggml_tensor* img;
if (sd_version_is_qwen_image(sd_ctx->sd->version)) { if (sd_img_gen_params->auto_resize_ref_image) {
LOG_DEBUG("auto resize ref images");
sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]); sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height); int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height);
double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height); double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
double vae_height = vae_width * ref_image.height / ref_image.width; double vae_height = vae_width * ref_image.height / ref_image.width;
vae_height = round(vae_height / 32) * 32; int factor = 16;
vae_width = round(vae_width / 32) * 32; if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
factor = 32;
}
vae_height = round(vae_height / factor) * factor;
vae_width = round(vae_width / factor) * factor;
sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height)); sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
free(ref_image.data); free(ref_image.data);

View File

@ -216,6 +216,7 @@ typedef struct {
sd_image_t init_image; sd_image_t init_image;
sd_image_t* ref_images; sd_image_t* ref_images;
int ref_images_count; int ref_images_count;
bool auto_resize_ref_image;
bool increase_ref_index; bool increase_ref_index;
sd_image_t mask_image; sd_image_t mask_image;
int width; int width;