Compare commits

..

No commits in common. "0723ee51c9f8ec84af1384591d8899d77beed315" and "db6f4791b4a41b909a6138bc90e122dc85284de2" have entirely different histories.

4 changed files with 319 additions and 596 deletions

View File

@ -1,110 +1,113 @@
# Run # Run
``` ```
usage: ./bin/sd [options] usage: ./bin/sd [arguments]
Options: arguments:
-m, --model <string> path to full model -h, --help show this help message and exit
--clip_l <string> path to the clip-l text encoder -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen
--clip_g <string> path to the clip-g text encoder -t, --threads N number of threads to use during computation (default: -1)
--clip_vision <string> path to the clip-vision encoder If threads <= 0, then threads will be set to the number of CPU physical cores
--t5xxl <string> path to the t5xxl text encoder --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--qwen2vl <string> path to the qwen2vl text encoder -m, --model [MODEL] path to full model
--qwen2vl_vision <string> path to the qwen2vl vit --diffusion-model path to the standalone diffusion model
--diffusion-model <string> path to the standalone diffusion model --high-noise-diffusion-model path to the standalone high noise diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model --clip_l path to the clip-l text encoder
--vae <string> path to standalone vae model --clip_g path to the clip-g text encoder
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --clip_vision path to the clip-vision encoder
--control-net <string> path to control net model --t5xxl path to the t5xxl text encoder
--embd-dir <string> embeddings directory --qwen2vl path to the qwen2vl text encoder
--lora-model-dir <string> lora model directory --qwen2vl_vision path to the qwen2vl vit
-i, --init-img <string> path to the init image --vae [VAE] path to vae
--end-img <string> path to the end image, required by flf2v --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --control-net [CONTROL_PATH] path to control net model
--photo-maker <string> path to PHOTOMAKER model --embd-dir [EMBEDDING_PATH] path to embeddings
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --upscale-model [ESRGAN_PATH] path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed --upscale-repeats Run the ESRGAN upscaler this many times (default 1)
--mask <string> path to the mask image --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
--control-image <string> path to control image, control net If not specified, the default is the type of the weight file
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
lexicographical (character) order. For example, if the control video path is --lora-model-dir [DIR] lora model directory
`frames`, the directory contain images such as 00.png, 01.png, ... etc. -i, --init-img [IMAGE] path to the init image, required by img2img
-o, --output <string> path to write result image to (default: ./output.png) --mask [MASK] path to the mask image, required by img2img with mask
-p, --prompt <string> the prompt to render -i, --end-img [IMAGE] path to the end image, required by flf2v
-n, --negative-prompt <string> the negative prompt (default: "") --control-image [IMAGE] path to image condition, control net
--upscale-model <string> path to esrgan model. -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of --control-video [PATH] path to control video frames, It must be a directory path.
CPU physical cores The video frames inside should be stored as images in lexicographical (character) order
  --upscale-repeats <int>                Run the ESRGAN upscaler this many times (default: 1)                                                  For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.
-H, --height <int> image height, in pixel space (default: 512) --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
-W, --width <int> image width, in pixel space (default: 512) -o, --output OUTPUT path to write result image to (default: ./output.png)
--steps <int> number of sample steps (default: 20) -p, --prompt [PROMPT] the prompt to render
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto) -n, --negative-prompt PROMPT the negative prompt (default: "")
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, --cfg-scale SCALE unconditional guidance scale: (default: 7.0)
will be 1 for SD1.x, 2 for SD2.x --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
-b, --batch-count <int> batch count --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--video-frames <int> video frames (default: 1) 0 means disabled, a value of 2.5 is nice for sd3.5 medium
--fps <int> fps (default: 24) --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
NitroSD-Vibrant --skip-layer-start START SLG enabling point: (default: 0.01)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --skip-layer-end END SLG disabling point: (default: 0.2)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise)
medium --timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--skip-layer-start <float> SLG enabling point (default: 0.01) --steps STEPS number of sample steps (default: 20)
--skip-layer-end <float> SLG disabling point (default: 0.2) --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0) --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) 0 means disabled, a value of 2.5 is nice for sd3.5 medium
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) --high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2) --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)
--high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0) --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)
--strength <float> strength for noising/unnoising (default: 0.75) --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
--pm-style-strength <float> --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image (high noise) sampling method (default: "euler_a")
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto) SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--vace-strength <float> wan vace strength --strength STRENGTH strength for noising/unnoising (default: 0.75)
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5) --control-strength STRENGTH strength to apply Control Net (default: 0.9)
--vae-tiling process vae in tiles to reduce memory usage 1.0 corresponds to full destruction of information in init image
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae -H, --height H image height, in pixel space (default: 512)
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed -W, --width W image width, in pixel space (default: 512)
--control-net-cpu keep controlnet in cpu (for low vram) --rng {std_default, cuda} RNG (default: cuda)
--clip-on-cpu keep clip in cpu (for low vram) -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
--vae-on-cpu keep vae in cpu (for low vram) -b, --batch-count COUNT number of images to generate
--diffusion-fa use flash attention in the diffusion model --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
--vae-conv-direct use ggml_conv2d_direct in the vae model <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--canny apply canny preprocessor (edge detection) --vae-tiling process vae in tiles to reduce memory usage
-v, --verbose print extra info --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)
--color colors the logging tags according to level --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
--chroma-disable-dit-mask disable dit mask for chroma --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--chroma-enable-t5-mask enable t5 mask for chroma --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --vae-on-cpu keep vae in cpu (for low vram)
--disable-auto-resize-ref-image disable auto resize of ref images --clip-on-cpu keep clip in cpu (for low vram)
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen --diffusion-fa use flash attention in the diffusion model (for low vram)
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the Might lower quality, since it implies converting k and v to f16.
type of the weight file This might crash if it is not supported by the backend.
--rng RNG, one of [std_default, cuda], default: cuda --diffusion-conv-direct use Conv2d direct in the diffusion model
-s, --seed RNG seed (default: 42, use random seed for < 0) This might crash if it is not supported by the backend.
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)
tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) This might crash if it is not supported by the backend.
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow] --control-net-cpu keep controlnet in cpu (for low vram)
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: --canny apply canny preprocessor (edge detection)
discrete --color colors the logging tags according to level
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --chroma-disable-dit-mask disable dit mask for chroma
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --chroma-enable-t5-mask enable t5 mask for chroma
ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
--high-noise-scheduler (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, --video-frames video frames (default: 1)
simple], default: discrete --fps fps (default: 24)
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) only enabled if `--high-noise-steps` is set to -1
-h, --help show this help message and exit --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) --vace-strength wan vace strength
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 --photo-maker path to PHOTOMAKER model
(overrides --vae-tile-size) --pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir
--pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed
--pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20)
-v, --verbose print extra info
``` ```

View File

@ -7,7 +7,6 @@
#include <map> #include <map>
#include <random> #include <random>
#include <regex> #include <regex>
#include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
@ -81,8 +80,7 @@ struct SDParams {
std::string control_image_path; std::string control_image_path;
std::vector<std::string> ref_image_paths; std::vector<std::string> ref_image_paths;
std::string control_video_path; std::string control_video_path;
bool auto_resize_ref_image = true; bool increase_ref_index = false;
bool increase_ref_index = false;
std::string prompt; std::string prompt;
std::string negative_prompt; std::string negative_prompt;
@ -177,7 +175,6 @@ void print_params(SDParams params) {
printf(" %s\n", path.c_str()); printf(" %s\n", path.c_str());
}; };
printf(" control_video_path: %s\n", params.control_video_path.c_str()); printf(" control_video_path: %s\n", params.control_video_path.c_str());
printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false");
printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false");
printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
@ -214,6 +211,118 @@ void print_params(SDParams params) {
free(high_noise_sample_params_str); free(high_noise_sample_params_str);
} }
// Print the CLI help text for all supported arguments to stdout.
//
// The help is kept as a name/description table so the option-name column
// stays aligned by one formatting rule instead of hand-padded printf
// strings, and multi-line descriptions are expressed with embedded '\n'.
//
// Fix: "--end-img" was previously listed as "-i, --end-img", but parse_args
// binds "-i" only to "--init-img"; "--end-img" has no short flag.
void print_usage(int argc, const char* argv[]) {
    // Column (including the 2-space left margin) where descriptions start.
    constexpr size_t name_col = 38;

    struct HelpEntry {
        const char* name;  // option name(s) plus value hint
        const char* desc;  // description; '\n' separates continuation lines
    };

    static const HelpEntry entries[] = {
        {"-h, --help", "show this help message and exit"},
        {"-M, --mode [MODE]", "run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen"},
        {"-t, --threads N", "number of threads to use during computation (default: -1)\nIf threads <= 0, then threads will be set to the number of CPU physical cores"},
        {"--offload-to-cpu", "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed"},
        {"-m, --model [MODEL]", "path to full model"},
        {"--diffusion-model", "path to the standalone diffusion model"},
        {"--high-noise-diffusion-model", "path to the standalone high noise diffusion model"},
        {"--clip_l", "path to the clip-l text encoder"},
        {"--clip_g", "path to the clip-g text encoder"},
        {"--clip_vision", "path to the clip-vision encoder"},
        {"--t5xxl", "path to the t5xxl text encoder"},
        {"--qwen2vl", "path to the qwen2vl text encoder"},
        {"--qwen2vl_vision", "path to the qwen2vl vit"},
        {"--vae [VAE]", "path to vae"},
        {"--taesd [TAESD_PATH]", "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)"},
        {"--control-net [CONTROL_PATH]", "path to control net model"},
        {"--embd-dir [EMBEDDING_PATH]", "path to embeddings"},
        {"--upscale-model [ESRGAN_PATH]", "path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now"},
        {"--upscale-repeats", "Run the ESRGAN upscaler this many times (default 1)"},
        {"--type [TYPE]", "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\nIf not specified, the default is the type of the weight file"},
        {"--tensor-type-rules [EXPRESSION]", "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")"},
        {"--lora-model-dir [DIR]", "lora model directory"},
        {"-i, --init-img [IMAGE]", "path to the init image, required by img2img"},
        {"--mask [MASK]", "path to the mask image, required by img2img with mask"},
        // No "-i" here: parse_args only maps "-i" to --init-img.
        {"--end-img [IMAGE]", "path to the end image, required by flf2v"},
        {"--control-image [IMAGE]", "path to image condition, control net"},
        {"-r, --ref-image [PATH]", "reference image for Flux Kontext models (can be used multiple times) "},
        {"--control-video [PATH]", "path to control video frames, It must be a directory path.\nThe video frames inside should be stored as images in lexicographical (character) order\nFor example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc."},
        {"--increase-ref-index", "automatically increase the indices of references images based on the order they are listed (starting with 1)."},
        {"-o, --output OUTPUT", "path to write result image to (default: ./output.png)"},
        {"-p, --prompt [PROMPT]", "the prompt to render"},
        {"-n, --negative-prompt PROMPT", "the negative prompt (default: \"\")"},
        {"--cfg-scale SCALE", "unconditional guidance scale: (default: 7.0)"},
        {"--img-cfg-scale SCALE", "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)"},
        {"--guidance SCALE", "distilled guidance scale for models with guidance input (default: 3.5)"},
        {"--slg-scale SCALE", "skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n0 means disabled, a value of 2.5 is nice for sd3.5 medium"},
        {"--eta SCALE", "eta in DDIM, only for DDIM and TCD: (default: 0)"},
        {"--skip-layers LAYERS", "Layers to skip for SLG steps: (default: [7,8,9])"},
        {"--skip-layer-start START", "SLG enabling point: (default: 0.01)"},
        {"--skip-layer-end END", "SLG disabling point: (default: 0.2)"},
        {"--scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple}", "Denoiser sigma scheduler (default: discrete)"},
        {"--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}", "sampling method (default: \"euler\" for Flux/SD3/Wan, \"euler_a\" otherwise)"},
        {"--timestep-shift N", "shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant"},
        {"--steps STEPS", "number of sample steps (default: 20)"},
        {"--high-noise-cfg-scale SCALE", "(high noise) unconditional guidance scale: (default: 7.0)"},
        {"--high-noise-img-cfg-scale SCALE", "(high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)"},
        {"--high-noise-guidance SCALE", "(high noise) distilled guidance scale for models with guidance input (default: 3.5)"},
        {"--high-noise-slg-scale SCALE", "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n0 means disabled, a value of 2.5 is nice for sd3.5 medium"},
        {"--high-noise-eta SCALE", "(high noise) eta in DDIM, only for DDIM and TCD: (default: 0)"},
        {"--high-noise-skip-layers LAYERS", "(high noise) Layers to skip for SLG steps: (default: [7,8,9])"},
        {"--high-noise-skip-layer-start", "(high noise) SLG enabling point: (default: 0.01)"},
        {"--high-noise-skip-layer-end END", "(high noise) SLG disabling point: (default: 0.2)"},
        {"--high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple}", "Denoiser sigma scheduler (default: discrete)"},
        {"--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}", "(high noise) sampling method (default: \"euler_a\")"},
        {"--high-noise-steps STEPS", "(high noise) number of sample steps (default: -1 = auto)\nSLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])"},
        {"--strength STRENGTH", "strength for noising/unnoising (default: 0.75)"},
        {"--control-strength STRENGTH", "strength to apply Control Net (default: 0.9)\n1.0 corresponds to full destruction of information in init image"},
        {"-H, --height H", "image height, in pixel space (default: 512)"},
        {"-W, --width W", "image width, in pixel space (default: 512)"},
        {"--rng {std_default, cuda}", "RNG (default: cuda)"},
        {"-s SEED, --seed SEED", "RNG seed (default: 42, use random seed for < 0)"},
        {"-b, --batch-count COUNT", "number of images to generate"},
        {"--prediction {eps, v, edm_v, sd3_flow, flux_flow}", "Prediction type override."},
        {"--clip-skip N", "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x"},
        {"--vae-tiling", "process vae in tiles to reduce memory usage"},
        {"--vae-tile-size [X]x[Y]", "tile size for vae tiling (default: 32x32)"},
        {"--vae-relative-tile-size [X]x[Y]", "relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)"},
        {"--vae-tile-overlap OVERLAP", "tile overlap for vae tiling, in fraction of tile size (default: 0.5)"},
        {"--force-sdxl-vae-conv-scale", "force use of conv scale on sdxl vae"},
        {"--vae-on-cpu", "keep vae in cpu (for low vram)"},
        {"--clip-on-cpu", "keep clip in cpu (for low vram)"},
        {"--diffusion-fa", "use flash attention in the diffusion model (for low vram)\nMight lower quality, since it implies converting k and v to f16.\nThis might crash if it is not supported by the backend."},
        {"--diffusion-conv-direct", "use Conv2d direct in the diffusion model\nThis might crash if it is not supported by the backend."},
        {"--vae-conv-direct", "use Conv2d direct in the vae model (should improve the performance)\nThis might crash if it is not supported by the backend."},
        {"--control-net-cpu", "keep controlnet in cpu (for low vram)"},
        {"--canny", "apply canny preprocessor (edge detection)"},
        {"--color", "colors the logging tags according to level"},
        {"--chroma-disable-dit-mask", "disable dit mask for chroma"},
        {"--chroma-enable-t5-mask", "enable t5 mask for chroma"},
        {"--chroma-t5-mask-pad PAD_SIZE", "t5 mask pad size of chroma"},
        {"--video-frames", "video frames (default: 1)"},
        {"--fps", "fps (default: 24)"},
        {"--moe-boundary BOUNDARY", "timestep boundary for Wan2.2 MoE model. (default: 0.875)\nonly enabled if `--high-noise-steps` is set to -1"},
        {"--flow-shift SHIFT", "shift value for Flow models like SD3.x or WAN (default: auto)"},
        {"--vace-strength", "wan vace strength"},
        {"--photo-maker", "path to PHOTOMAKER model"},
        {"--pm-id-images-dir [DIR]", "path to PHOTOMAKER input id images dir"},
        {"--pm-id-embed-path [PATH]", "path to PHOTOMAKER v2 id embed"},
        {"--pm-style-strength", "strength for keeping PHOTOMAKER input identity (default: 20)"},
        {"-v, --verbose", "print extra info"},
    };

    printf("usage: %s [arguments]\n", argv[0]);
    printf("\n");
    printf("arguments:\n");
    for (const HelpEntry& e : entries) {
        // Build the left column: 2-space margin + names, padded to name_col.
        std::string name_field = "  ";
        name_field += e.name;
        if (name_field.size() < name_col) {
            name_field.append(name_col - name_field.size(), ' ');
        } else {
            name_field += ' ';  // name overflows the column; keep one separator
        }
        const std::string desc = e.desc;
        size_t start      = 0;
        bool first_line   = true;
        while (start <= desc.size()) {
            size_t nl  = desc.find('\n', start);
            size_t end = (nl == std::string::npos) ? desc.size() : nl;
            std::string part = desc.substr(start, end - start);
            if (first_line) {
                printf("%s%s\n", name_field.c_str(), part.c_str());
                first_line = false;
            } else {
                // Continuation lines align with the description column.
                printf("%*s%s\n", (int)name_col, "", part.c_str());
            }
            if (nl == std::string::npos) {
                break;
            }
            start = nl + 1;
        }
    }
}
#if defined(_WIN32) #if defined(_WIN32)
static std::string utf16_to_utf8(const std::wstring& wstr) { static std::string utf16_to_utf8(const std::wstring& wstr) {
if (wstr.empty()) if (wstr.empty())
@ -383,424 +492,93 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) {
return true; return true;
} }
// Word-wrap `text` to at most `width` columns, indenting every wrapped or
// manual continuation line with `indent` spaces.
//
// Fixes over the previous version:
//   * after breaking at a space, line_len is now indent + carried-tail length
//     (the old code reset it to `indent`, miscounting the new line and letting
//     subsequent lines exceed `width`);
//   * tracks the last breakable space incrementally instead of rescanning the
//     whole accumulated buffer on every break (O(n) instead of O(n^2)).
static std::string wrap_text(const std::string& text, size_t width, size_t indent) {
    std::string out;
    size_t line_len = 0;
    // Index into `out` of the last space on the current line, or npos.
    size_t last_space = std::string::npos;

    for (char c : text) {
        // Preserve manual newlines, starting the next line at `indent`.
        if (c == '\n') {
            out += '\n';
            out.append(indent, ' ');
            line_len   = indent;
            last_space = std::string::npos;
            continue;
        }

        out += c;
        if (c == ' ') {
            last_space = out.size() - 1;
        }
        ++line_len;

        if (line_len >= width) {
            if (last_space != std::string::npos) {
                // Clean break: replace the last space with newline + indent,
                // carrying the partial word to the new line.
                std::string tail = out.substr(last_space + 1);
                out.erase(last_space);
                out += '\n';
                out.append(indent, ' ');
                out += tail;
                line_len = indent + tail.size();
            } else {
                // No space on this line: hard break at width.
                out += '\n';
                out.append(indent, ' ');
                line_len = indent;
            }
            // The carried tail contains no spaces (it followed the last one).
            last_space = std::string::npos;
        }
    }
    return out;
}
// Print the auto-generated usage/help table for every registered option.
// Rows are collected from all option groups, the names column is sized to
// the widest entry, and each description is wrapped to the line width.
void print_usage(int argc, const char* argv[], const ArgOptions& options) {
    constexpr size_t max_line_width = 120;

    std::cout << "Usage: " << argv[0] << " [options]\n\n";
    std::cout << "Options:\n";

    // One row of the help table: names column + description.
    struct Row {
        std::string label;
        std::string desc;
    };
    std::vector<Row> rows;

    // Build the label "<short>, <long> <hint>", skipping empty parts.
    auto push_row = [&rows](const std::string& short_name, const std::string& long_name,
                            const std::string& desc, const std::string& hint = "") {
        std::string label;
        if (!short_name.empty()) {
            label += short_name;
        }
        if (!short_name.empty() && !long_name.empty()) {
            label += ", ";
        }
        if (!long_name.empty()) {
            label += long_name;
        }
        if (!hint.empty()) {
            label += " ";
            label += hint;
        }
        rows.push_back({label, desc});
    };

    for (const auto& o : options.string_options)
        push_row(o.short_name, o.long_name, o.desc, "<string>");
    for (const auto& o : options.int_options)
        push_row(o.short_name, o.long_name, o.desc, "<int>");
    for (const auto& o : options.float_options)
        push_row(o.short_name, o.long_name, o.desc, "<float>");
    for (const auto& o : options.bool_options)
        push_row(o.short_name, o.long_name, o.desc);
    for (const auto& o : options.manual_options)
        push_row(o.short_name, o.long_name, o.desc);

    // Width of the widest label, used to align every description.
    size_t name_col_width = 0;
    for (const auto& row : rows) {
        name_col_width = std::max(name_col_width, row.label.size());
    }

    for (const auto& row : rows) {
        // Descriptions start after: 2-space margin + names column + 4-space gap.
        const size_t desc_indent = 2 + name_col_width + 4;
        std::cout << "  " << std::left
                  << std::setw(static_cast<int>(name_col_width) + 4) << row.label
                  << wrap_text(row.desc, max_line_width, desc_indent) << "\n";
    }
}
void parse_args(int argc, const char** argv, SDParams& params) { void parse_args(int argc, const char** argv, SDParams& params) {
ArgOptions options; ArgOptions options;
options.string_options = { options.string_options = {
{"-m", {"-m", "--model", "", &params.model_path},
"--model", {"", "--clip_l", "", &params.clip_l_path},
"path to full model", {"", "--clip_g", "", &params.clip_g_path},
&params.model_path}, {"", "--clip_vision", "", &params.clip_vision_path},
{"", {"", "--t5xxl", "", &params.t5xxl_path},
"--clip_l", {"", "--qwen2vl", "", &params.qwen2vl_path},
"path to the clip-l text encoder", &params.clip_l_path}, {"", "--qwen2vl_vision", "", &params.qwen2vl_vision_path},
{"", "--clip_g", {"", "--diffusion-model", "", &params.diffusion_model_path},
"path to the clip-g text encoder", {"", "--high-noise-diffusion-model", "", &params.high_noise_diffusion_model_path},
&params.clip_g_path}, {"", "--vae", "", &params.vae_path},
{"", {"", "--taesd", "", &params.taesd_path},
"--clip_vision", {"", "--control-net", "", &params.control_net_path},
"path to the clip-vision encoder", {"", "--embd-dir", "", &params.embedding_dir},
&params.clip_vision_path}, {"", "--lora-model-dir", "", &params.lora_model_dir},
{"", {"-i", "--init-img", "", &params.init_image_path},
"--t5xxl", {"", "--end-img", "", &params.end_image_path},
"path to the t5xxl text encoder", {"", "--tensor-type-rules", "", &params.tensor_type_rules},
&params.t5xxl_path}, {"", "--photo-maker", "", &params.photo_maker_path},
{"", {"", "--pm-id-images-dir", "", &params.pm_id_images_dir},
"--qwen2vl", {"", "--pm-id-embed-path", "", &params.pm_id_embed_path},
"path to the qwen2vl text encoder", {"", "--mask", "", &params.mask_image_path},
&params.qwen2vl_path}, {"", "--control-image", "", &params.control_image_path},
{"", {"", "--control-video", "", &params.control_video_path},
"--qwen2vl_vision", {"-o", "--output", "", &params.output_path},
"path to the qwen2vl vit", {"-p", "--prompt", "", &params.prompt},
&params.qwen2vl_vision_path}, {"-n", "--negative-prompt", "", &params.negative_prompt},
{"", {"", "--upscale-model", "", &params.esrgan_path},
"--diffusion-model",
"path to the standalone diffusion model",
&params.diffusion_model_path},
{"",
"--high-noise-diffusion-model",
"path to the standalone high noise diffusion model",
&params.high_noise_diffusion_model_path},
{"",
"--vae",
"path to standalone vae model",
&params.vae_path},
{"",
"--taesd",
"path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
&params.taesd_path},
{"",
"--control-net",
"path to control net model",
&params.control_net_path},
{"",
"--embd-dir",
"embeddings directory",
&params.embedding_dir},
{"",
"--lora-model-dir",
"lora model directory",
&params.lora_model_dir},
{"-i",
"--init-img",
"path to the init image",
&params.init_image_path},
{"",
"--end-img",
"path to the end image, required by flf2v",
&params.end_image_path},
{"",
"--tensor-type-rules",
"weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
&params.tensor_type_rules},
{"",
"--photo-maker",
"path to PHOTOMAKER model",
&params.photo_maker_path},
{"",
"--pm-id-images-dir",
"path to PHOTOMAKER input id images dir",
&params.pm_id_images_dir},
{"",
"--pm-id-embed-path",
"path to PHOTOMAKER v2 id embed",
&params.pm_id_embed_path},
{"",
"--mask",
"path to the mask image",
&params.mask_image_path},
{"",
"--control-image",
"path to control image, control net",
&params.control_image_path},
{"",
"--control-video",
"path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
"lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
"such as 00.png, 01.png, ... etc.",
&params.control_video_path},
{"-o",
"--output",
"path to write result image to (default: ./output.png)",
&params.output_path},
{"-p",
"--prompt",
"the prompt to render",
&params.prompt},
{"-n",
"--negative-prompt",
"the negative prompt (default: \"\")",
&params.negative_prompt},
{"",
"--upscale-model",
"path to esrgan model.",
&params.esrgan_path},
}; };
options.int_options = { options.int_options = {
{"-t", {"-t", "--threads", "", &params.n_threads},
"--threads", {"", "--upscale-repeats", "", &params.upscale_repeats},
"number of threads to use during computation (default: -1). " {"-H", "--height", "", &params.height},
"If threads <= 0, then threads will be set to the number of CPU physical cores", {"-W", "--width", "", &params.width},
&params.n_threads}, {"", "--steps", "", &params.sample_params.sample_steps},
{"", {"", "--high-noise-steps", "", &params.high_noise_sample_params.sample_steps},
"--upscale-repeats", {"", "--clip-skip", "", &params.clip_skip},
"Run the ESRGAN upscaler this many times (default: 1)", {"-b", "--batch-count", "", &params.batch_count},
&params.upscale_repeats}, {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad},
{"-H", {"", "--video-frames", "", &params.video_frames},
"--height", {"", "--fps", "", &params.fps},
"image height, in pixel space (default: 512)", {"", "--timestep-shift", "", &params.sample_params.shifted_timestep},
&params.height},
{"-W",
"--width",
"image width, in pixel space (default: 512)",
&params.width},
{"",
"--steps",
"number of sample steps (default: 20)",
&params.sample_params.sample_steps},
{"",
"--high-noise-steps",
"(high noise) number of sample steps (default: -1 = auto)",
&params.high_noise_sample_params.sample_steps},
{"",
"--clip-skip",
"ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). "
"<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x",
&params.clip_skip},
{"-b",
"--batch-count",
"batch count",
&params.batch_count},
{"",
"--chroma-t5-mask-pad",
"t5 mask pad size of chroma",
&params.chroma_t5_mask_pad},
{"",
"--video-frames",
"video frames (default: 1)",
&params.video_frames},
{"",
"--fps",
"fps (default: 24)",
&params.fps},
{"",
"--timestep-shift",
"shift timestep for NitroFusion models (default: 0). "
"recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant",
&params.sample_params.shifted_timestep},
}; };
options.float_options = { options.float_options = {
{"", {"", "--cfg-scale", "", &params.sample_params.guidance.txt_cfg},
"--cfg-scale", {"", "--img-cfg-scale", "", &params.sample_params.guidance.img_cfg},
"unconditional guidance scale: (default: 7.0)", {"", "--guidance", "", &params.sample_params.guidance.distilled_guidance},
&params.sample_params.guidance.txt_cfg}, {"", "--slg-scale", "", &params.sample_params.guidance.slg.scale},
{"", {"", "--skip-layer-start", "", &params.sample_params.guidance.slg.layer_start},
"--img-cfg-scale", {"", "--skip-layer-end", "", &params.sample_params.guidance.slg.layer_end},
"image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", {"", "--eta", "", &params.sample_params.eta},
&params.sample_params.guidance.img_cfg}, {"", "--high-noise-cfg-scale", "", &params.high_noise_sample_params.guidance.txt_cfg},
{"", {"", "--high-noise-img-cfg-scale", "", &params.high_noise_sample_params.guidance.img_cfg},
"--guidance", {"", "--high-noise-guidance", "", &params.high_noise_sample_params.guidance.distilled_guidance},
"distilled guidance scale for models with guidance input (default: 3.5)", {"", "--high-noise-slg-scale", "", &params.high_noise_sample_params.guidance.slg.scale},
&params.sample_params.guidance.distilled_guidance}, {"", "--high-noise-skip-layer-start", "", &params.high_noise_sample_params.guidance.slg.layer_start},
{"", {"", "--high-noise-skip-layer-end", "", &params.high_noise_sample_params.guidance.slg.layer_end},
"--slg-scale", {"", "--high-noise-eta", "", &params.high_noise_sample_params.eta},
"skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium", {"", "--strength", "", &params.strength},
&params.sample_params.guidance.slg.scale}, {"", "--pm-style-strength", "", &params.pm_style_strength},
{"", {"", "--control-strength", "", &params.control_strength},
"--skip-layer-start", {"", "--moe-boundary", "", &params.moe_boundary},
"SLG enabling point (default: 0.01)", {"", "--flow-shift", "", &params.flow_shift},
&params.sample_params.guidance.slg.layer_start}, {"", "--vace-strength", "", &params.vace_strength},
{"", {"", "--vae-tile-overlap", "", &params.vae_tiling_params.target_overlap},
"--skip-layer-end",
"SLG disabling point (default: 0.2)",
&params.sample_params.guidance.slg.layer_end},
{"",
"--eta",
"eta in DDIM, only for DDIM and TCD (default: 0)",
&params.sample_params.eta},
{"",
"--high-noise-cfg-scale",
"(high noise) unconditional guidance scale: (default: 7.0)",
&params.high_noise_sample_params.guidance.txt_cfg},
{"",
"--high-noise-img-cfg-scale",
"(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)",
&params.high_noise_sample_params.guidance.img_cfg},
{"",
"--high-noise-guidance",
"(high noise) distilled guidance scale for models with guidance input (default: 3.5)",
&params.high_noise_sample_params.guidance.distilled_guidance},
{"",
"--high-noise-slg-scale",
"(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)",
&params.high_noise_sample_params.guidance.slg.scale},
{"",
"--high-noise-skip-layer-start",
"(high noise) SLG enabling point (default: 0.01)",
&params.high_noise_sample_params.guidance.slg.layer_start},
{"",
"--high-noise-skip-layer-end",
"(high noise) SLG disabling point (default: 0.2)",
&params.high_noise_sample_params.guidance.slg.layer_end},
{"",
"--high-noise-eta",
"(high noise) eta in DDIM, only for DDIM and TCD (default: 0)",
&params.high_noise_sample_params.eta},
{"",
"--strength",
"strength for noising/unnoising (default: 0.75)",
&params.strength},
{"",
"--pm-style-strength",
"",
&params.pm_style_strength},
{"",
"--control-strength",
"strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
&params.control_strength},
{"",
"--moe-boundary",
"timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1",
&params.moe_boundary},
{"",
"--flow-shift",
"shift value for Flow models like SD3.x or WAN (default: auto)",
&params.flow_shift},
{"",
"--vace-strength",
"wan vace strength",
&params.vace_strength},
{"",
"--vae-tile-overlap",
"tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
&params.vae_tiling_params.target_overlap},
}; };
options.bool_options = { options.bool_options = {
{"", {"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled},
"--vae-tiling", {"", "--force-sdxl-vae-conv-scale", "", true, &params.force_sdxl_vae_conv_scale},
"process vae in tiles to reduce memory usage", {"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
true, &params.vae_tiling_params.enabled}, {"", "--control-net-cpu", "", true, &params.control_net_cpu},
{"", {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
"--force-sdxl-vae-conv-scale", {"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
"force use of conv scale on sdxl vae", {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
true, &params.force_sdxl_vae_conv_scale}, {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
{"", {"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
"--offload-to-cpu", {"", "--canny", "", true, &params.canny_preprocess},
"place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", {"-v", "--verbose", "", true, &params.verbose},
true, &params.offload_params_to_cpu}, {"", "--color", "", true, &params.color},
{"", {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
"--control-net-cpu", {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
"keep controlnet in cpu (for low vram)", {"", "--increase-ref-index", "", true, &params.increase_ref_index},
true, &params.control_net_cpu},
{"",
"--clip-on-cpu",
"keep clip in cpu (for low vram)",
true, &params.clip_on_cpu},
{"",
"--vae-on-cpu",
"keep vae in cpu (for low vram)",
true, &params.vae_on_cpu},
{"",
"--diffusion-fa",
"use flash attention in the diffusion model",
true, &params.diffusion_flash_attn},
{"",
"--diffusion-conv-direct",
"use ggml_conv2d_direct in the diffusion model",
true, &params.diffusion_conv_direct},
{"",
"--vae-conv-direct",
"use ggml_conv2d_direct in the vae model",
true, &params.vae_conv_direct},
{"",
"--canny",
"apply canny preprocessor (edge detection)",
true, &params.canny_preprocess},
{"-v",
"--verbose",
"print extra info",
true, &params.verbose},
{"",
"--color",
"colors the logging tags according to level",
true, &params.color},
{"",
"--chroma-disable-dit-mask",
"disable dit mask for chroma",
false, &params.chroma_use_dit_mask},
{"",
"--chroma-enable-t5-mask",
"enable t5 mask for chroma",
true, &params.chroma_use_t5_mask},
{"",
"--increase-ref-index",
"automatically increase the indices of references images based on the order they are listed (starting with 1).",
true, &params.increase_ref_index},
{"",
"--disable-auto-resize-ref-image",
"disable auto resize of ref images",
false, &params.auto_resize_ref_image},
}; };
auto on_mode_arg = [&](int argc, const char** argv, int index) { auto on_mode_arg = [&](int argc, const char** argv, int index) {
@ -933,7 +711,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
}; };
auto on_help_arg = [&](int argc, const char** argv, int index) { auto on_help_arg = [&](int argc, const char** argv, int index) {
print_usage(argc, argv, options); print_usage(argc, argv);
exit(0); exit(0);
return 0; return 0;
}; };
@ -1047,73 +825,25 @@ void parse_args(int argc, const char** argv, SDParams& params) {
}; };
options.manual_options = { options.manual_options = {
{"-M", {"-M", "--mode", "", on_mode_arg},
"--mode", {"", "--type", "", on_type_arg},
"run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen", {"", "--rng", "", on_rng_arg},
on_mode_arg}, {"-s", "--seed", "", on_seed_arg},
{"", {"", "--sampling-method", "", on_sample_method_arg},
"--type", {"", "--prediction", "", on_prediction_arg},
"weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). " {"", "--scheduler", "", on_schedule_arg},
"If not specified, the default is the type of the weight file", {"", "--skip-layers", "", on_skip_layers_arg},
on_type_arg}, {"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg},
{"", {"", "--high-noise-scheduler", "", on_high_noise_schedule_arg},
"--rng", {"", "--high-noise-skip-layers", "", on_high_noise_skip_layers_arg},
"RNG, one of [std_default, cuda], default: cuda", {"-r", "--ref-image", "", on_ref_image_arg},
on_rng_arg}, {"-h", "--help", "", on_help_arg},
{"-s", {"", "--vae-tile-size", "", on_tile_size_arg},
"--seed", {"", "--vae-relative-tile-size", "", on_relative_tile_size_arg},
"RNG seed (default: 42, use random seed for < 0)",
on_seed_arg},
{"",
"--sampling-method",
"sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] "
"(default: euler for Flux/SD3/Wan, euler_a otherwise)",
on_sample_method_arg},
{"",
"--prediction",
"prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]",
on_prediction_arg},
{"",
"--scheduler",
"denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete",
on_schedule_arg},
{"",
"--skip-layers",
"layers to skip for SLG steps (default: [7,8,9])",
on_skip_layers_arg},
{"",
"--high-noise-sampling-method",
"(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]"
" default: euler for Flux/SD3/Wan, euler_a otherwise",
on_high_noise_sample_method_arg},
{"",
"--high-noise-scheduler",
"(high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete",
on_high_noise_schedule_arg},
{"",
"--high-noise-skip-layers",
"(high noise) layers to skip for SLG steps (default: [7,8,9])",
on_high_noise_skip_layers_arg},
{"-r",
"--ref-image",
"reference image for Flux Kontext models (can be used multiple times)",
on_ref_image_arg},
{"-h",
"--help",
"show this help message and exit",
on_help_arg},
{"",
"--vae-tile-size",
"tile size for vae tiling, format [X]x[Y] (default: 32x32)",
on_tile_size_arg},
{"",
"--vae-relative-tile-size",
"relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
on_relative_tile_size_arg},
}; };
if (!parse_options(argc, argv, options)) { if (!parse_options(argc, argv, options)) {
print_usage(argc, argv, options); print_usage(argc, argv);
exit(1); exit(1);
} }
@ -1123,19 +853,19 @@ void parse_args(int argc, const char** argv, SDParams& params) {
if ((params.mode == IMG_GEN || params.mode == VID_GEN) && params.prompt.length() == 0) { if ((params.mode == IMG_GEN || params.mode == VID_GEN) && params.prompt.length() == 0) {
fprintf(stderr, "error: the following arguments are required: prompt\n"); fprintf(stderr, "error: the following arguments are required: prompt\n");
print_usage(argc, argv, options); print_usage(argc, argv);
exit(1); exit(1);
} }
if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) { if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
print_usage(argc, argv, options); print_usage(argc, argv);
exit(1); exit(1);
} }
if (params.output_path.length() == 0) { if (params.output_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: output_path\n"); fprintf(stderr, "error: the following arguments are required: output_path\n");
print_usage(argc, argv, options); print_usage(argc, argv);
exit(1); exit(1);
} }
@ -1698,7 +1428,6 @@ int main(int argc, const char* argv[]) {
init_image, init_image,
ref_images.data(), ref_images.data(),
(int)ref_images.size(), (int)ref_images.size(),
params.auto_resize_ref_image,
params.increase_ref_index, params.increase_ref_index,
mask_image, mask_image,
params.width, params.width,

View File

@ -1970,7 +1970,6 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
"seed: %" PRId64 "seed: %" PRId64
"batch_count: %d\n" "batch_count: %d\n"
"ref_images_count: %d\n" "ref_images_count: %d\n"
"auto_resize_ref_image: %s\n"
"increase_ref_index: %s\n" "increase_ref_index: %s\n"
"control_strength: %.2f\n" "control_strength: %.2f\n"
"photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n" "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
@ -1985,7 +1984,6 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
sd_img_gen_params->seed, sd_img_gen_params->seed,
sd_img_gen_params->batch_count, sd_img_gen_params->batch_count,
sd_img_gen_params->ref_images_count, sd_img_gen_params->ref_images_count,
BOOL_STR(sd_img_gen_params->auto_resize_ref_image),
BOOL_STR(sd_img_gen_params->increase_ref_index), BOOL_STR(sd_img_gen_params->increase_ref_index),
sd_img_gen_params->control_strength, sd_img_gen_params->control_strength,
sd_img_gen_params->pm_params.style_strength, sd_img_gen_params->pm_params.style_strength,
@ -2626,20 +2624,14 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
std::vector<ggml_tensor*> ref_latents; std::vector<ggml_tensor*> ref_latents;
for (int i = 0; i < ref_images.size(); i++) { for (int i = 0; i < ref_images.size(); i++) {
ggml_tensor* img; ggml_tensor* img;
if (sd_img_gen_params->auto_resize_ref_image) { if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
LOG_DEBUG("auto resize ref images");
sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]); sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height); int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height);
double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height); double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
double vae_height = vae_width * ref_image.height / ref_image.width; double vae_height = vae_width * ref_image.height / ref_image.width;
int factor = 16; vae_height = round(vae_height / 32) * 32;
if (sd_version_is_qwen_image(sd_ctx->sd->version)) { vae_width = round(vae_width / 32) * 32;
factor = 32;
}
vae_height = round(vae_height / factor) * factor;
vae_width = round(vae_width / factor) * factor;
sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height)); sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
free(ref_image.data); free(ref_image.data);

View File

@ -216,7 +216,6 @@ typedef struct {
sd_image_t init_image; sd_image_t init_image;
sd_image_t* ref_images; sd_image_t* ref_images;
int ref_images_count; int ref_images_count;
bool auto_resize_ref_image;
bool increase_ref_index; bool increase_ref_index;
sd_image_t mask_image; sd_image_t mask_image;
int width; int width;