#include "common.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(_WIN32) #define NOMINMAX #include #endif // _WIN32 #include "log.h" #include "media_io.h" #include "resource_owners.hpp" using json = nlohmann::json; namespace fs = std::filesystem; const char* const modes_str[] = { "img_gen", "vid_gen", "convert", "upscale", "metadata", }; #if defined(_WIN32) static std::string utf16_to_utf8(const std::wstring& wstr) { if (wstr.empty()) return {}; int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(), nullptr, 0, nullptr, nullptr); if (size_needed <= 0) throw std::runtime_error("UTF-16 to UTF-8 conversion failed"); std::string utf8(size_needed, 0); WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(), (char*)utf8.data(), size_needed, nullptr, nullptr); return utf8; } static std::string argv_to_utf8(int index, const char** argv) { (void)argv; int argc; wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc); if (!argv_w) throw std::runtime_error("Failed to parse command line"); std::string result; if (index < argc) { result = utf16_to_utf8(argv_w[index]); } LocalFree(argv_w); return result; } #else // Linux / macOS static std::string argv_to_utf8(int index, const char** argv) { return std::string(argv[index]); } #endif template static std::string vec_to_string(const std::vector& v) { std::ostringstream oss; oss << "["; for (size_t i = 0; i < v.size(); i++) { oss << v[i]; if (i + 1 < v.size()) oss << ", "; } oss << "]"; return oss.str(); } static std::string vec_str_to_string(const std::vector& v) { std::ostringstream oss; oss << "["; for (size_t i = 0; i < v.size(); i++) { oss << "\"" << v[i] << "\""; if (i + 1 < v.size()) oss << ", "; } oss << "]"; return oss.str(); } static bool is_absolute_path(const std::string& p) { #ifdef _WIN32 return p.size() > 1 && std::isalpha(static_cast(p[0])) && p[1] == ':'; #else return !p.empty() && p[0] == '/'; #endif } std::string ArgOptions::wrap_text(const std::string& text, size_t width, size_t indent) { std::ostringstream oss; size_t pos = 0; size_t line_len = 0; while (pos < text.size()) { if (text[pos] == '\n') { oss << '\n' << std::string(indent, ' '); line_len = 0; ++pos; continue; } if (std::isspace(static_cast(text[pos]))) { ++pos; continue; } size_t word_start = pos; while (pos < text.size() && text[pos] != '\n' && !std::isspace(static_cast(text[pos]))) { ++pos; } std::string word = text.substr(word_start, pos - word_start); while (!word.empty()) { size_t separator_len = line_len == 0 ? 0 : 1; if (line_len + separator_len + word.size() <= width) { if (separator_len > 0) { oss << ' '; ++line_len; } oss << word; line_len += word.size(); word.clear(); continue; } if (line_len > 0) { oss << '\n' << std::string(indent, ' '); line_len = 0; continue; } size_t chunk_len = std::min(width, word.size()); oss << word.substr(0, chunk_len); line_len = chunk_len; word.erase(0, chunk_len); if (!word.empty()) { oss << '\n' << std::string(indent, ' '); line_len = 0; } } } return oss.str(); } void ArgOptions::print() const { constexpr size_t max_line_width = 120; struct Entry { std::string names; std::string desc; }; std::vector entries; auto add_entry = [&](const std::string& s, const std::string& l, const std::string& desc, const std::string& hint = "") { std::ostringstream ss; if (!s.empty()) ss << s; if (!s.empty() && !l.empty()) ss << ", "; if (!l.empty()) ss << l; if (!hint.empty()) ss << " " << hint; entries.push_back({ss.str(), desc}); }; for (auto& o : string_options) add_entry(o.short_name, o.long_name, o.desc, ""); for (auto& o : int_options) add_entry(o.short_name, o.long_name, o.desc, ""); for (auto& o : float_options) add_entry(o.short_name, o.long_name, o.desc, ""); for (auto& o : bool_options) add_entry(o.short_name, o.long_name, o.desc, ""); for (auto& o : manual_options) add_entry(o.short_name, o.long_name, o.desc); size_t max_name_width = 0; for (auto& e : entries) max_name_width = std::max(max_name_width, e.names.size()); for (auto& e : entries) { size_t indent = 2 + max_name_width + 4; size_t desc_width = (max_line_width > indent ? max_line_width - indent : 40); std::string wrapped_desc = wrap_text(e.desc, desc_width, indent); std::cout << " " << std::left << std::setw(static_cast(max_name_width) + 4) << e.names << wrapped_desc << "\n"; } } bool parse_options(int argc, const char** argv, const std::vector& options_list) { bool invalid_arg = false; std::string arg; auto match_and_apply = [&](auto& opts, auto&& apply_fn) -> bool { for (auto& option : opts) { if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { apply_fn(option); return true; } } return false; }; for (int i = 1; i < argc; i++) { arg = argv[i]; bool found_arg = false; for (auto& options : options_list) { if (match_and_apply(options.string_options, [&](auto& option) { if (++i >= argc) { invalid_arg = true; return; } *option.target = argv_to_utf8(i, argv); found_arg = true; })) break; if (match_and_apply(options.int_options, [&](auto& option) { if (++i >= argc) { invalid_arg = true; return; } *option.target = std::stoi(argv[i]); found_arg = true; })) break; if (match_and_apply(options.float_options, [&](auto& option) { if (++i >= argc) { invalid_arg = true; return; } *option.target = std::stof(argv[i]); found_arg = true; })) break; if (match_and_apply(options.bool_options, [&](auto& option) { *option.target = option.keep_true ? true : false; found_arg = true; })) break; if (match_and_apply(options.manual_options, [&](auto& option) { int ret = option.cb(argc, argv, i); if (ret < 0) { invalid_arg = true; return; } i += ret; found_arg = true; })) break; } if (invalid_arg) { LOG_ERROR("error: invalid parameter for argument: %s", arg.c_str()); return false; } if (!found_arg) { LOG_ERROR("error: unknown argument: %s", arg.c_str()); return false; } } return true; } ArgOptions SDContextParams::get_options() { ArgOptions options; options.string_options = { {"-m", "--model", "path to full model", &model_path}, {"", "--clip_l", "path to the clip-l text encoder", &clip_l_path}, {"", "--clip_g", "path to the clip-g text encoder", &clip_g_path}, {"", "--clip_vision", "path to the clip-vision encoder", &clip_vision_path}, {"", "--t5xxl", "path to the t5xxl text encoder", &t5xxl_path}, {"", "--llm", "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)", &llm_path}, {"", "--llm_vision", "path to the llm vit", &llm_vision_path}, {"", "--qwen2vl", "alias of --llm. Deprecated.", &llm_path}, {"", "--qwen2vl_vision", "alias of --llm_vision. Deprecated.", &llm_vision_path}, {"", "--diffusion-model", "path to the standalone diffusion model", &diffusion_model_path}, {"", "--high-noise-diffusion-model", "path to the standalone high noise diffusion model", &high_noise_diffusion_model_path}, {"", "--embeddings-connectors", "path to LTXAV embeddings connectors", &embeddings_connectors_path}, {"", "--vae", "path to standalone vae model", &vae_path}, {"", "--taesd", "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)", &taesd_path}, {"", "--tae", "alias of --taesd", &taesd_path}, {"", "--control-net", "path to control net model", &control_net_path}, {"", "--embd-dir", "embeddings directory", &embedding_dir}, {"", "--lora-model-dir", "lora model directory", &lora_model_dir}, {"", "--hires-upscalers-dir", "highres fix upscaler model directory", &hires_upscalers_dir}, {"", "--tensor-type-rules", "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", &tensor_type_rules}, {"", "--photo-maker", "path to PHOTOMAKER model", &photo_maker_path}, {"", "--upscale-model", "path to esrgan model.", &esrgan_path}, }; options.int_options = { {"-t", "--threads", "number of threads to use during computation (default: -1). " "If threads <= 0, then threads will be set to the number of CPU physical cores", &n_threads}, {"", "--chroma-t5-mask-pad", "t5 mask pad size of chroma", &chroma_t5_mask_pad}, }; options.float_options = { {"", "--max-vram", "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting", &max_vram}, }; options.bool_options = { {"", "--force-sdxl-vae-conv-scale", "force use of conv scale on sdxl vae", true, &force_sdxl_vae_conv_scale}, {"", "--offload-to-cpu", "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", true, &offload_params_to_cpu}, {"", "--mmap", "whether to memory-map model", true, &enable_mmap}, {"", "--control-net-cpu", "keep controlnet in cpu (for low vram)", true, &control_net_cpu}, {"", "--clip-on-cpu", "keep clip in cpu (for low vram)", true, &clip_on_cpu}, {"", "--vae-on-cpu", "keep vae in cpu (for low vram)", true, &vae_on_cpu}, {"", "--fa", "use flash attention", true, &flash_attn}, {"", "--diffusion-fa", "use flash attention in the diffusion model only", true, &diffusion_flash_attn}, {"", "--diffusion-conv-direct", "use ggml_conv2d_direct in the diffusion model", true, &diffusion_conv_direct}, {"", "--vae-conv-direct", "use ggml_conv2d_direct in the vae model", true, &vae_conv_direct}, {"", "--circular", "enable circular padding for convolutions", true, &circular}, {"", "--circularx", "enable circular RoPE wrapping on x-axis (width) only", true, &circular_x}, {"", "--circulary", "enable circular RoPE wrapping on y-axis (height) only", true, &circular_y}, {"", "--chroma-disable-dit-mask", "disable dit mask for chroma", false, &chroma_use_dit_mask}, {"", "--qwen-image-zero-cond-t", "enable zero_cond_t for qwen image", true, &qwen_image_zero_cond_t}, {"", "--chroma-enable-t5-mask", "enable t5 mask for chroma", true, &chroma_use_t5_mask}, }; auto on_type_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; wtype = str_to_sd_type(arg); if (wtype == SD_TYPE_COUNT) { LOG_ERROR("error: invalid weight format %s", arg); return -1; } return 1; }; auto on_rng_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; rng_type = str_to_rng_type(arg); if (rng_type == RNG_TYPE_COUNT) { LOG_ERROR("error: invalid rng type %s", arg); return -1; } return 1; }; auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; sampler_rng_type = str_to_rng_type(arg); if (sampler_rng_type == RNG_TYPE_COUNT) { LOG_ERROR("error: invalid sampler rng type %s", arg); return -1; } return 1; }; auto on_prediction_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; prediction = str_to_prediction(arg); if (prediction == PREDICTION_COUNT) { LOG_ERROR("error: invalid prediction type %s", arg); return -1; } return 1; }; auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; lora_apply_mode = str_to_lora_apply_mode(arg); if (lora_apply_mode == LORA_APPLY_MODE_COUNT) { LOG_ERROR("error: invalid lora apply model %s", arg); return -1; } return 1; }; options.manual_options = { {"", "--type", "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). " "If not specified, the default is the type of the weight file", on_type_arg}, {"", "--rng", "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)", on_rng_arg}, {"", "--sampler-rng", "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng", on_sampler_rng_arg}, {"", "--prediction", "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]", on_prediction_arg}, {"", "--lora-apply-mode", "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. " "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used." "The immediately mode may have precision and compatibility issues with quantized parameters, " "but it usually offers faster inference speed and, in some cases, lower memory usage. " "The at_runtime mode, on the other hand, is exactly the opposite.", on_lora_apply_mode_arg}, }; return options; } void SDContextParams::build_embedding_map() { static const std::vector valid_ext = {".gguf", ".safetensors", ".pt"}; if (!fs::exists(embedding_dir) || !fs::is_directory(embedding_dir)) { return; } for (auto& p : fs::directory_iterator(embedding_dir)) { if (!p.is_regular_file()) continue; auto path = p.path(); std::string ext = path.extension().string(); bool valid = false; for (auto& e : valid_ext) { if (ext == e) { valid = true; break; } } if (!valid) continue; std::string key = path.stem().string(); std::string value = path.string(); embedding_map[key] = value; } } bool SDContextParams::resolve(SDMode mode) { if (n_threads <= 0) { n_threads = sd_get_num_physical_cores(); } build_embedding_map(); return true; } bool SDContextParams::validate(SDMode mode) { if (mode != UPSCALE && mode != METADATA && model_path.length() == 0 && diffusion_model_path.length() == 0) { LOG_ERROR("error: the following arguments are required: model_path/diffusion_model\n"); return false; } if (mode == UPSCALE) { if (esrgan_path.length() == 0) { LOG_ERROR("error: upscale mode needs an upscaler model (--upscale-model)\n"); return false; } } return true; } bool SDContextParams::resolve_and_validate(SDMode mode) { if (!resolve(mode)) { return false; } if (!validate(mode)) { return false; } return true; } std::string SDContextParams::to_string() const { std::ostringstream emb_ss; emb_ss << "{\n"; for (auto it = embedding_map.begin(); it != embedding_map.end(); ++it) { emb_ss << " \"" << it->first << "\": \"" << it->second << "\""; if (std::next(it) != embedding_map.end()) { emb_ss << ","; } emb_ss << "\n"; } emb_ss << " }"; std::string embeddings_str = emb_ss.str(); std::ostringstream oss; oss << "SDContextParams {\n" << " n_threads: " << n_threads << ",\n" << " model_path: \"" << model_path << "\",\n" << " clip_l_path: \"" << clip_l_path << "\",\n" << " clip_g_path: \"" << clip_g_path << "\",\n" << " clip_vision_path: \"" << clip_vision_path << "\",\n" << " t5xxl_path: \"" << t5xxl_path << "\",\n" << " llm_path: \"" << llm_path << "\",\n" << " llm_vision_path: \"" << llm_vision_path << "\",\n" << " diffusion_model_path: \"" << diffusion_model_path << "\",\n" << " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n" << " embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n" << " vae_path: \"" << vae_path << "\",\n" << " taesd_path: \"" << taesd_path << "\",\n" << " esrgan_path: \"" << esrgan_path << "\",\n" << " control_net_path: \"" << control_net_path << "\",\n" << " embedding_dir: \"" << embedding_dir << "\",\n" << " embeddings: " << embeddings_str << "\n" << " wtype: " << sd_type_name(wtype) << ",\n" << " tensor_type_rules: \"" << tensor_type_rules << "\",\n" << " lora_model_dir: \"" << lora_model_dir << "\",\n" << " hires_upscalers_dir: \"" << hires_upscalers_dir << "\",\n" << " photo_maker_path: \"" << photo_maker_path << "\",\n" << " rng_type: " << sd_rng_type_name(rng_type) << ",\n" << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " max_vram: " << max_vram << ",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" << " circular: " << (circular ? "true" : "false") << ",\n" << " circular_x: " << (circular_x ? "true" : "false") << ",\n" << " circular_y: " << (circular_y ? "true" : "false") << ",\n" << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" << " qwen_image_zero_cond_t: " << (qwen_image_zero_cond_t ? "true" : "false") << ",\n" << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n" << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" << " prediction: " << sd_prediction_name(prediction) << ",\n" << " lora_apply_mode: " << sd_lora_apply_mode_name(lora_apply_mode) << ",\n" << " force_sdxl_vae_conv_scale: " << (force_sdxl_vae_conv_scale ? "true" : "false") << "\n" << "}"; return oss.str(); } sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) { embedding_vec.clear(); embedding_vec.reserve(embedding_map.size()); for (const auto& kv : embedding_map) { sd_embedding_t item; item.name = kv.first.c_str(); item.path = kv.second.c_str(); embedding_vec.emplace_back(item); } sd_ctx_params_t sd_ctx_params = { model_path.c_str(), clip_l_path.c_str(), clip_g_path.c_str(), clip_vision_path.c_str(), t5xxl_path.c_str(), llm_path.c_str(), llm_vision_path.c_str(), diffusion_model_path.c_str(), high_noise_diffusion_model_path.c_str(), embeddings_connectors_path.c_str(), vae_path.c_str(), taesd_path.c_str(), control_net_path.c_str(), embedding_vec.data(), static_cast(embedding_vec.size()), photo_maker_path.c_str(), tensor_type_rules.c_str(), vae_decode_only, free_params_immediately, n_threads, wtype, rng_type, sampler_rng_type, prediction, lora_apply_mode, offload_params_to_cpu, enable_mmap, clip_on_cpu, control_net_cpu, vae_on_cpu, flash_attn, diffusion_flash_attn, taesd_preview, diffusion_conv_direct, vae_conv_direct, circular || circular_x, circular || circular_y, force_sdxl_vae_conv_scale, chroma_use_dit_mask, chroma_use_t5_mask, chroma_t5_mask_pad, qwen_image_zero_cond_t, max_vram, }; return sd_ctx_params; } SDGenerationParams::SDGenerationParams() { sd_sample_params_init(&sample_params); sd_sample_params_init(&high_noise_sample_params); } ArgOptions SDGenerationParams::get_options() { ArgOptions options; options.string_options = { {"-p", "--prompt", "the prompt to render", &prompt}, {"-n", "--negative-prompt", "the negative prompt (default: \"\")", &negative_prompt}, {"-i", "--init-img", "path to the init image", &init_image_path}, {"", "--end-img", "path to the end image, required by flf2v", &end_image_path}, {"", "--mask", "path to the mask image", &mask_image_path}, {"", "--control-image", "path to control image, control net", &control_image_path}, {"", "--control-video", "path to control video frames, It must be a directory path. The video frames inside should be stored as images in " "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images " "such as 00.png, 01.png, ... etc.", &control_video_path}, {"", "--pm-id-images-dir", "path to PHOTOMAKER input id images dir", &pm_id_images_dir}, {"", "--pm-id-embed-path", "path to PHOTOMAKER v2 id embed", &pm_id_embed_path}, {"", "--hires-upscaler", "highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), " "Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name " "under --hires-upscalers-dir (default: Latent)", &hires_upscaler}, }; options.int_options = { {"-H", "--height", "image height, in pixel space (default: 512)", &height}, {"-W", "--width", "image width, in pixel space (default: 512)", &width}, {"", "--steps", "number of sample steps (default: 20)", &sample_params.sample_steps}, {"", "--high-noise-steps", "(high noise) number of sample steps (default: -1 = auto)", &high_noise_sample_params.sample_steps}, {"", "--clip-skip", "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). " "<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x", &clip_skip}, {"-b", "--batch-count", "batch count", &batch_count}, {"", "--video-frames", "video frames (default: 1)", &video_frames}, {"", "--fps", "fps (default: 24)", &fps}, {"", "--timestep-shift", "shift timestep for NitroFusion models (default: 0). " "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant", &sample_params.shifted_timestep}, {"", "--upscale-repeats", "Run the ESRGAN upscaler this many times (default: 1)", &upscale_repeats}, {"", "--upscale-tile-size", "tile size for ESRGAN upscaling (default: 128)", &upscale_tile_size}, {"", "--hires-width", "highres fix target width, 0 to use --hires-scale (default: 0)", &hires_width}, {"", "--hires-height", "highres fix target height, 0 to use --hires-scale (default: 0)", &hires_height}, {"", "--hires-steps", "highres fix second pass sample steps, 0 to reuse --steps (default: 0)", &hires_steps}, {"", "--hires-upscale-tile-size", "highres fix upscaler tile size, reserved for model-backed upscalers (default: 128)", &hires_upscale_tile_size}, }; options.float_options = { {"", "--cfg-scale", "unconditional guidance scale: (default: 7.0)", &sample_params.guidance.txt_cfg}, {"", "--img-cfg-scale", "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", &sample_params.guidance.img_cfg}, {"", "--guidance", "distilled guidance scale for models with guidance input (default: 3.5)", &sample_params.guidance.distilled_guidance}, {"", "--slg-scale", "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium", &sample_params.guidance.slg.scale}, {"", "--skip-layer-start", "SLG enabling point (default: 0.01)", &sample_params.guidance.slg.layer_start}, {"", "--skip-layer-end", "SLG disabling point (default: 0.2)", &sample_params.guidance.slg.layer_end}, {"", "--eta", "noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)", &sample_params.eta}, {"", "--flow-shift", "shift value for Flow models like SD3.x or WAN (default: auto)", &sample_params.flow_shift}, {"", "--high-noise-cfg-scale", "(high noise) unconditional guidance scale: (default: 7.0)", &high_noise_sample_params.guidance.txt_cfg}, {"", "--high-noise-img-cfg-scale", "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)", &high_noise_sample_params.guidance.img_cfg}, {"", "--high-noise-guidance", "(high noise) distilled guidance scale for models with guidance input (default: 3.5)", &high_noise_sample_params.guidance.distilled_guidance}, {"", "--high-noise-slg-scale", "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)", &high_noise_sample_params.guidance.slg.scale}, {"", "--high-noise-skip-layer-start", "(high noise) SLG enabling point (default: 0.01)", &high_noise_sample_params.guidance.slg.layer_start}, {"", "--high-noise-skip-layer-end", "(high noise) SLG disabling point (default: 0.2)", &high_noise_sample_params.guidance.slg.layer_end}, {"", "--high-noise-eta", "(high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)", &high_noise_sample_params.eta}, {"", "--strength", "strength for noising/unnoising (default: 0.75)", &strength}, {"", "--pm-style-strength", "", &pm_style_strength}, {"", "--control-strength", "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image", &control_strength}, {"", "--moe-boundary", "timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1", &moe_boundary}, {"", "--vace-strength", "wan vace strength", &vace_strength}, {"", "--vae-tile-overlap", "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", &vae_tiling_params.target_overlap}, {"", "--hires-scale", "highres fix scale when target size is not set (default: 2.0)", &hires_scale}, {"", "--hires-denoising-strength", "highres fix second pass denoising strength (default: 0.7)", &hires_denoising_strength}, }; options.bool_options = { {"", "--increase-ref-index", "automatically increase the indices of references images based on the order they are listed (starting with 1).", true, &increase_ref_index}, {"", "--disable-auto-resize-ref-image", "disable auto resize of ref images", false, &auto_resize_ref_image}, {"", "--disable-image-metadata", "do not embed generation metadata on image files", false, &embed_image_metadata}, {"", "--vae-tiling", "process vae in tiles to reduce memory usage", true, &vae_tiling_params.enabled}, {"", "--temporal-tiling", "enable temporal tiling for LTX video VAE decode", true, &vae_tiling_params.temporal_tiling}, {"", "--hires", "enable highres fix", true, &hires_enabled}, }; auto on_seed_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } seed = std::stoll(argv[index]); return 1; }; auto on_sample_method_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; sample_params.sample_method = str_to_sample_method(arg); if (sample_params.sample_method == SAMPLE_METHOD_COUNT) { LOG_ERROR("error: invalid sample method %s", arg); return -1; } return 1; }; auto on_high_noise_sample_method_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; high_noise_sample_params.sample_method = str_to_sample_method(arg); if (high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) { LOG_ERROR("error: invalid high noise sample method %s", arg); return -1; } return 1; }; auto on_scheduler_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; sample_params.scheduler = str_to_scheduler(arg); if (sample_params.scheduler == SCHEDULER_COUNT) { LOG_ERROR("error: invalid scheduler %s", arg); return -1; } return 1; }; auto on_skip_layers_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } std::string layers_str = argv[index]; if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { return -1; } layers_str = layers_str.substr(1, layers_str.size() - 2); std::regex regex("[, ]+"); std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); std::sregex_token_iterator end; std::vector tokens(iter, end); std::vector layers; for (const auto& token : tokens) { try { layers.push_back(std::stoi(token)); } catch (const std::invalid_argument&) { return -1; } } skip_layers = layers; return 1; }; auto on_high_noise_skip_layers_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } std::string layers_str = argv[index]; if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { return -1; } layers_str = layers_str.substr(1, layers_str.size() - 2); std::regex regex("[, ]+"); std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); std::sregex_token_iterator end; std::vector tokens(iter, end); std::vector layers; for (const auto& token : tokens) { try { layers.push_back(std::stoi(token)); } catch (const std::invalid_argument&) { return -1; } } high_noise_skip_layers = layers; return 1; }; auto on_sigmas_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } std::string sigmas_str = argv[index]; if (!sigmas_str.empty() && sigmas_str.front() == '[') { sigmas_str.erase(0, 1); } if (!sigmas_str.empty() && sigmas_str.back() == ']') { sigmas_str.pop_back(); } std::stringstream ss(sigmas_str); std::string item; while (std::getline(ss, item, ',')) { item.erase(0, item.find_first_not_of(" \t\n\r\f\v")); item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1); if (!item.empty()) { try { custom_sigmas.push_back(std::stof(item)); } catch (const std::invalid_argument&) { LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str()); return -1; } catch (const std::out_of_range&) { LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str()); return -1; } } } if (custom_sigmas.empty() && !sigmas_str.empty()) { LOG_ERROR("error: could not parse any sigma values from '%s'", argv[index]); return -1; } return 1; }; auto on_ref_image_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } ref_image_paths.push_back(argv[index]); return 1; }; auto on_cache_mode_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } cache_mode = argv_to_utf8(index, argv); if (cache_mode != "easycache" && cache_mode != "ucache" && cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit" && cache_mode != "spectrum") { fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', 'cache-dit', or 'spectrum'\n", cache_mode.c_str()); return -1; } return 1; }; auto on_cache_option_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } cache_option = argv_to_utf8(index, argv); return 1; }; auto on_scm_mask_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } scm_mask = argv_to_utf8(index, argv); return 1; }; auto on_scm_policy_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } std::string policy = argv_to_utf8(index, argv); if (policy == "dynamic") { scm_policy_dynamic = true; } else if (policy == "static") { scm_policy_dynamic = false; } else { fprintf(stderr, "error: invalid scm policy '%s', must be 'dynamic' or 'static'\n", policy.c_str()); return -1; } return 1; }; auto on_tile_size_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } std::string tile_size_str = argv[index]; size_t x_pos = tile_size_str.find('x'); try { if (x_pos != std::string::npos) { std::string tile_x_str = tile_size_str.substr(0, x_pos); std::string tile_y_str = tile_size_str.substr(x_pos + 1); vae_tiling_params.tile_size_x = std::stoi(tile_x_str); vae_tiling_params.tile_size_y = std::stoi(tile_y_str); } else { vae_tiling_params.tile_size_x = vae_tiling_params.tile_size_y = std::stoi(tile_size_str); } } catch (const std::invalid_argument&) { return -1; } catch (const std::out_of_range&) { return -1; } return 1; }; auto on_relative_tile_size_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } std::string rel_size_str = argv[index]; size_t x_pos = rel_size_str.find('x'); try { if (x_pos != std::string::npos) { std::string rel_x_str = rel_size_str.substr(0, x_pos); std::string rel_y_str = rel_size_str.substr(x_pos + 1); vae_tiling_params.rel_size_x = std::stof(rel_x_str); vae_tiling_params.rel_size_y = std::stof(rel_y_str); } else { vae_tiling_params.rel_size_x = vae_tiling_params.rel_size_y = std::stof(rel_size_str); } } catch (const std::invalid_argument&) { return -1; } catch (const std::out_of_range&) { return -1; } return 1; }; options.manual_options = { {"-s", "--seed", "RNG seed (default: 42, use random seed for < 0)", on_seed_arg}, {"", "--sampling-method", "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde] " "(default: euler for Flux/SD3/Wan, euler_a otherwise)", on_sample_method_arg}, {"", "--high-noise-sampling-method", "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde]" " default: euler for Flux/SD3/Wan, euler_a otherwise", on_high_noise_sample_method_arg}, {"", "--scheduler", "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default: discrete", on_scheduler_arg}, {"", "--sigmas", "custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").", on_sigmas_arg}, {"", "--skip-layers", "layers to skip for SLG steps (default: [7,8,9])", on_skip_layers_arg}, {"", "--high-noise-skip-layers", "(high noise) layers to skip for SLG steps (default: [7,8,9])", on_high_noise_skip_layers_arg}, {"-r", "--ref-image", "reference image for Flux Kontext models (can be used multiple times)", on_ref_image_arg}, {"", "--cache-mode", "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)", on_cache_mode_arg}, {"", "--cache-option", "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"", on_cache_option_arg}, {"", "--scm-mask", "SCM steps mask for cache-dit: comma-separated 0/1 (e.g., \"1,1,1,0,0,1,0,0,1,0\") - 1=compute, 0=can cache", on_scm_mask_arg}, {"", "--scm-policy", "SCM policy: 'dynamic' (default) or 'static'", on_scm_policy_arg}, {"", "--vae-tile-size", "tile size for vae tiling, format [X]x[Y] (default: 32x32)", on_tile_size_arg}, {"", "--vae-relative-tile-size", "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)", on_relative_tile_size_arg}, }; return options; } static const std::string k_base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789+/"; static bool is_base64(unsigned char c) { return std::isalnum(c) || c == '+' || c == '/'; } static std::vector decode_base64_bytes(const std::string& encoded_string) { int in_len = static_cast(encoded_string.size()); int i = 0; int j = 0; int in_ = 0; uint8_t char_array_4[4]; uint8_t char_array_3[3]; std::vector ret; while (in_len-- && encoded_string[in_] != '=' && is_base64(encoded_string[in_])) { char_array_4[i++] = encoded_string[in_]; in_++; if (i == 4) { for (i = 0; i < 4; i++) { char_array_4[i] = static_cast(k_base64_chars.find(char_array_4[i])); } char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4); char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; for (i = 0; i < 3; i++) { ret.push_back(char_array_3[i]); } i = 0; } } if (i) { for (j = i; j < 4; j++) { char_array_4[j] = 0; } for (j = 0; j < 4; j++) { char_array_4[j] = static_cast(k_base64_chars.find(char_array_4[j])); } char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4); char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; for (j = 0; j < i - 1; j++) { ret.push_back(char_array_3[j]); } } return ret; } bool decode_base64_image(const std::string& encoded_input, int target_channels, int expected_width, int expected_height, SDImageOwner& out_image) { std::string encoded = encoded_input; auto comma_pos = encoded.find(','); if (comma_pos != std::string::npos) { encoded = encoded.substr(comma_pos + 1); } std::vector image_bytes = decode_base64_bytes(encoded); if (image_bytes.empty()) { return false; } int decoded_width = 0; int decoded_height = 0; uint8_t* raw_data = load_image_from_memory(reinterpret_cast(image_bytes.data()), static_cast(image_bytes.size()), decoded_width, decoded_height, expected_width, expected_height, target_channels); if (raw_data == nullptr) { return false; } out_image.reset({(uint32_t)decoded_width, (uint32_t)decoded_height, (uint32_t)target_channels, raw_data}); return true; } static bool parse_image_json_field(const json& parent, const char* key, int channels, int expected_width, int expected_height, SDImageOwner& out_image) { if (!parent.contains(key)) { return true; } if (parent.at(key).is_null()) { out_image.reset({0, 0, (uint32_t)channels, nullptr}); return true; } if (!parent.at(key).is_string()) { return false; } return decode_base64_image(parent.at(key).get(), channels, expected_width, expected_height, out_image); } static bool parse_image_array_json_field(const json& parent, const char* key, int channels, int expected_width, int expected_height, std::vector& out_images) { if (!parent.contains(key)) { return true; } if (parent.at(key).is_null()) { out_images.clear(); return true; } if (!parent.at(key).is_array()) { return false; } out_images.clear(); for (const auto& item : parent.at(key)) { if (!item.is_string()) { return false; } SDImageOwner image; if (!decode_base64_image(item.get(), channels, expected_width, expected_height, image)) { return false; } out_images.push_back(std::move(image)); } return true; } static bool parse_lora_json_field(const json& parent, const std::function& lora_path_resolver, std::map& lora_map, std::map& high_noise_lora_map) { if (!parent.contains("lora")) { return true; } if (!parent.at("lora").is_array()) { return false; } lora_map.clear(); high_noise_lora_map.clear(); for (const auto& item : parent.at("lora")) { if (!item.is_object()) { return false; } std::string path = item.value("path", ""); if (path.empty()) { return false; } std::string resolved_path = lora_path_resolver ? lora_path_resolver(path) : path; if (resolved_path.empty()) { return false; } const float multiplier = item.value("multiplier", 1.0f); const bool is_high_noise = item.value("is_high_noise", false); if (is_high_noise) { high_noise_lora_map[resolved_path] += multiplier; } else { lora_map[resolved_path] += multiplier; } } return true; } static bool resolve_model_file_from_dir(const std::string& model_name, const std::string& model_dir, const std::vector& valid_ext, const char* label, std::string& resolved_path) { if (model_dir.empty()) { LOG_ERROR("%s directory is empty", label); return false; } if (model_name.empty() || model_name.find('/') != std::string::npos || model_name.find('\\') != std::string::npos || fs::path(model_name).has_root_path() || fs::path(model_name).has_extension()) { LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str()); return false; } fs::path model_dir_path = model_dir; for (const auto& ext : valid_ext) { fs::path try_path = model_dir_path / (model_name + ext); if (fs::exists(try_path) && fs::is_regular_file(try_path)) { resolved_path = try_path.lexically_normal().string(); return true; } } LOG_ERROR("can not find %s %s in %s", label, model_name.c_str(), model_dir_path.lexically_normal().string().c_str()); return false; } bool SDGenerationParams::from_json_str( const std::string& json_str, const std::function& lora_path_resolver) { json j; try { j = json::parse(json_str); } catch (...) { LOG_ERROR("json parse failed %s", json_str.c_str()); return false; } auto load_if_exists = [&](const char* key, auto& out) { if (j.contains(key)) { using T = std::decay_t; if constexpr (std::is_same_v) { if (j[key].is_string()) out = j[key]; } else if constexpr (std::is_same_v || std::is_same_v) { if (j[key].is_number_integer()) out = j[key]; } else if constexpr (std::is_same_v) { if (j[key].is_number()) out = j[key]; } else if constexpr (std::is_same_v) { if (j[key].is_boolean()) out = j[key]; } else if constexpr (std::is_same_v>) { if (j[key].is_array()) out = j[key].get>(); } else if constexpr (std::is_same_v>) { if (j[key].is_array()) out = j[key].get>(); } else if constexpr (std::is_same_v>) { if (j[key].is_array()) out = j[key].get>(); } } }; load_if_exists("prompt", prompt); load_if_exists("negative_prompt", negative_prompt); load_if_exists("cache_mode", cache_mode); load_if_exists("cache_option", cache_option); load_if_exists("scm_mask", scm_mask); load_if_exists("clip_skip", clip_skip); load_if_exists("width", width); load_if_exists("height", height); load_if_exists("batch_count", batch_count); load_if_exists("video_frames", video_frames); load_if_exists("fps", fps); load_if_exists("upscale_repeats", upscale_repeats); load_if_exists("seed", seed); load_if_exists("strength", strength); load_if_exists("control_strength", control_strength); load_if_exists("moe_boundary", moe_boundary); load_if_exists("vace_strength", vace_strength); load_if_exists("auto_resize_ref_image", auto_resize_ref_image); load_if_exists("increase_ref_index", increase_ref_index); load_if_exists("embed_image_metadata", embed_image_metadata); if (j.contains("hires") && j["hires"].is_object()) { const json& hires_json = j["hires"]; if (hires_json.contains("enabled") && hires_json["enabled"].is_boolean()) { hires_enabled = hires_json["enabled"]; } if (hires_json.contains("upscaler") && hires_json["upscaler"].is_string()) { hires_upscaler = hires_json["upscaler"]; } if (hires_json.contains("scale") && hires_json["scale"].is_number()) { hires_scale = hires_json["scale"]; } if (hires_json.contains("target_width") && hires_json["target_width"].is_number_integer()) { hires_width = hires_json["target_width"]; } if (hires_json.contains("target_height") && hires_json["target_height"].is_number_integer()) { hires_height = hires_json["target_height"]; } if (hires_json.contains("steps") && hires_json["steps"].is_number_integer()) { hires_steps = hires_json["steps"]; } if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) { hires_denoising_strength = hires_json["denoising_strength"]; } if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) { hires_upscale_tile_size = hires_json["upscale_tile_size"]; } } auto parse_sample_params_json = [&](const json& sample_json, sd_sample_params_t& target_params, std::vector& target_skip_layers, std::vector* target_custom_sigmas) { if (sample_json.contains("sample_steps") && sample_json["sample_steps"].is_number_integer()) { target_params.sample_steps = sample_json["sample_steps"]; } if (sample_json.contains("eta") && sample_json["eta"].is_number()) { target_params.eta = sample_json["eta"]; } if (sample_json.contains("shifted_timestep") && sample_json["shifted_timestep"].is_number_integer()) { target_params.shifted_timestep = sample_json["shifted_timestep"]; } if (sample_json.contains("flow_shift") && sample_json["flow_shift"].is_number()) { target_params.flow_shift = sample_json["flow_shift"]; } if (target_custom_sigmas != nullptr && sample_json.contains("custom_sigmas") && sample_json["custom_sigmas"].is_array()) { *target_custom_sigmas = sample_json["custom_sigmas"].get>(); } if (sample_json.contains("sample_method") && sample_json["sample_method"].is_string()) { enum sample_method_t tmp = str_to_sample_method(sample_json["sample_method"].get().c_str()); if (tmp != SAMPLE_METHOD_COUNT) { target_params.sample_method = tmp; } } if (sample_json.contains("scheduler") && sample_json["scheduler"].is_string()) { enum scheduler_t tmp = str_to_scheduler(sample_json["scheduler"].get().c_str()); if (tmp != SCHEDULER_COUNT) { target_params.scheduler = tmp; } } if (sample_json.contains("guidance") && sample_json["guidance"].is_object()) { const json& guidance_json = sample_json["guidance"]; if (guidance_json.contains("txt_cfg") && guidance_json["txt_cfg"].is_number()) { target_params.guidance.txt_cfg = guidance_json["txt_cfg"]; } if (guidance_json.contains("img_cfg") && guidance_json["img_cfg"].is_number()) { target_params.guidance.img_cfg = guidance_json["img_cfg"]; } if (guidance_json.contains("distilled_guidance") && guidance_json["distilled_guidance"].is_number()) { target_params.guidance.distilled_guidance = guidance_json["distilled_guidance"]; } if (guidance_json.contains("slg") && guidance_json["slg"].is_object()) { const json& slg_json = guidance_json["slg"]; if (slg_json.contains("layers") && slg_json["layers"].is_array()) { target_skip_layers = slg_json["layers"].get>(); } if (slg_json.contains("layer_start") && slg_json["layer_start"].is_number()) { target_params.guidance.slg.layer_start = slg_json["layer_start"]; } if (slg_json.contains("layer_end") && slg_json["layer_end"].is_number()) { target_params.guidance.slg.layer_end = slg_json["layer_end"]; } if (slg_json.contains("scale") && slg_json["scale"].is_number()) { target_params.guidance.slg.scale = slg_json["scale"]; } } } }; if (j.contains("sample_params") && j["sample_params"].is_object()) { parse_sample_params_json(j["sample_params"], sample_params, skip_layers, &custom_sigmas); } if (j.contains("high_noise_sample_params") && j["high_noise_sample_params"].is_object()) { parse_sample_params_json(j["high_noise_sample_params"], high_noise_sample_params, high_noise_skip_layers, nullptr); } if (j.contains("vae_tiling_params") && j["vae_tiling_params"].is_object()) { const json& tiling_json = j["vae_tiling_params"]; if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) { vae_tiling_params.enabled = tiling_json["enabled"]; } if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) { vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"]; } if (tiling_json.contains("tile_size_x") && tiling_json["tile_size_x"].is_number_integer()) { vae_tiling_params.tile_size_x = tiling_json["tile_size_x"]; } if (tiling_json.contains("tile_size_y") && tiling_json["tile_size_y"].is_number_integer()) { vae_tiling_params.tile_size_y = tiling_json["tile_size_y"]; } if (tiling_json.contains("target_overlap") && tiling_json["target_overlap"].is_number()) { vae_tiling_params.target_overlap = tiling_json["target_overlap"]; } if (tiling_json.contains("rel_size_x") && tiling_json["rel_size_x"].is_number()) { vae_tiling_params.rel_size_x = tiling_json["rel_size_x"]; } if (tiling_json.contains("rel_size_y") && tiling_json["rel_size_y"].is_number()) { vae_tiling_params.rel_size_y = tiling_json["rel_size_y"]; } } if (!parse_lora_json_field(j, lora_path_resolver, lora_map, high_noise_lora_map)) { LOG_ERROR("invalid lora"); return false; } if (!parse_image_json_field(j, "init_image", 3, width, height, init_image)) { LOG_ERROR("invalid init_image"); return false; } if (!parse_image_json_field(j, "end_image", 3, width, height, end_image)) { LOG_ERROR("invalid end_image"); return false; } if (!parse_image_array_json_field(j, "ref_images", 3, width, height, ref_images)) { LOG_ERROR("invalid ref_images"); return false; } if (!parse_image_array_json_field(j, "control_frames", 3, width, height, control_frames)) { LOG_ERROR("invalid control_frames"); return false; } if (!parse_image_json_field(j, "mask_image", 1, width, height, mask_image)) { LOG_ERROR("invalid mask_image"); return false; } if (!parse_image_json_field(j, "control_image", 3, width, height, control_image)) { LOG_ERROR("invalid control_image"); return false; } return true; } void SDGenerationParams::extract_and_remove_lora(const std::string& lora_model_dir) { if (lora_model_dir.empty()) { return; } static const std::regex re(R"(]+):([^>]+)>)"); static const std::vector valid_ext = {".gguf", ".safetensors", ".pt"}; std::smatch m; std::string tmp = prompt; while (std::regex_search(tmp, m, re)) { std::string raw_path = m[1].str(); const std::string raw_mul = m[2].str(); float mul = 0.f; try { mul = std::stof(raw_mul); } catch (...) { tmp = m.suffix().str(); prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only); continue; } bool is_high_noise = false; static const std::string prefix = "|high_noise|"; if (raw_path.rfind(prefix, 0) == 0) { raw_path.erase(0, prefix.size()); is_high_noise = true; } fs::path final_path; if (is_absolute_path(raw_path)) { final_path = raw_path; } else { final_path = fs::path(lora_model_dir) / raw_path; } if (!fs::exists(final_path)) { bool found = false; for (const auto& ext : valid_ext) { fs::path try_path = final_path; try_path += ext; if (fs::exists(try_path)) { final_path = try_path; found = true; break; } } if (!found) { LOG_WARN("can not found lora %s", final_path.lexically_normal().string().c_str()); tmp = m.suffix().str(); prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only); continue; } } const std::string key = final_path.lexically_normal().string(); if (is_high_noise) high_noise_lora_map[key] += mul; else lora_map[key] += mul; prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only); tmp = m.suffix().str(); } } bool SDGenerationParams::width_and_height_are_set() const { return width > 0 && height > 0; } void SDGenerationParams::set_width_and_height_if_unset(int w, int h) { if (!width_and_height_are_set()) { LOG_INFO("set width x height to %d x %d", w, h); width = w; height = h; } } int SDGenerationParams::get_resolved_width() const { return (width > 0) ? width : 512; } int SDGenerationParams::get_resolved_height() const { return (height > 0) ? height : 512; } bool SDGenerationParams::initialize_cache_params() { sd_cache_params_init(&cache_params); auto parse_named_params = [&](const std::string& opt_str) -> bool { std::stringstream ss(opt_str); std::string token; while (std::getline(ss, token, ',')) { size_t eq_pos = token.find('='); if (eq_pos == std::string::npos) { LOG_ERROR("error: cache option '%s' missing '=' separator", token.c_str()); return false; } std::string key = token.substr(0, eq_pos); std::string val = token.substr(eq_pos + 1); try { if (key == "threshold") { if (cache_mode == "easycache" || cache_mode == "ucache") { cache_params.reuse_threshold = std::stof(val); } else { cache_params.residual_diff_threshold = std::stof(val); } } else if (key == "start") { cache_params.start_percent = std::stof(val); } else if (key == "end") { cache_params.end_percent = std::stof(val); } else if (key == "decay") { cache_params.error_decay_rate = std::stof(val); } else if (key == "relative") { cache_params.use_relative_threshold = (std::stof(val) != 0.0f); } else if (key == "reset") { cache_params.reset_error_on_compute = (std::stof(val) != 0.0f); } else if (key == "Fn" || key == "fn") { cache_params.Fn_compute_blocks = std::stoi(val); } else if (key == "Bn" || key == "bn") { cache_params.Bn_compute_blocks = std::stoi(val); } else if (key == "warmup") { if (cache_mode == "spectrum") { cache_params.spectrum_warmup_steps = std::stoi(val); } else { cache_params.max_warmup_steps = std::stoi(val); } } else if (key == "w") { cache_params.spectrum_w = std::stof(val); } else if (key == "m") { cache_params.spectrum_m = std::stoi(val); } else if (key == "lam") { cache_params.spectrum_lam = std::stof(val); } else if (key == "window") { cache_params.spectrum_window_size = std::stoi(val); } else if (key == "flex") { cache_params.spectrum_flex_window = std::stof(val); } else if (key == "stop") { cache_params.spectrum_stop_percent = std::stof(val); } else { LOG_ERROR("error: unknown cache parameter '%s'", key.c_str()); return false; } } catch (const std::exception&) { LOG_ERROR("error: invalid value '%s' for parameter '%s'", val.c_str(), key.c_str()); return false; } } return true; }; if (!cache_mode.empty()) { if (cache_mode == "disabled") { cache_params.mode = SD_CACHE_DISABLED; } else if (cache_mode == "easycache") { cache_params.mode = SD_CACHE_EASYCACHE; } else if (cache_mode == "ucache") { cache_params.mode = SD_CACHE_UCACHE; } else if (cache_mode == "dbcache") { cache_params.mode = SD_CACHE_DBCACHE; } else if (cache_mode == "taylorseer") { cache_params.mode = SD_CACHE_TAYLORSEER; } else if (cache_mode == "cache-dit") { cache_params.mode = SD_CACHE_CACHE_DIT; } else if (cache_mode == "spectrum") { cache_params.mode = SD_CACHE_SPECTRUM; } else { LOG_ERROR("error: invalid cache mode '%s'", cache_mode.c_str()); return false; } } if (!cache_option.empty() && !parse_named_params(cache_option)) { return false; } if (cache_params.mode == SD_CACHE_DBCACHE || cache_params.mode == SD_CACHE_TAYLORSEER || cache_params.mode == SD_CACHE_CACHE_DIT) { cache_params.scm_policy_dynamic = scm_policy_dynamic; } return true; } bool SDGenerationParams::resolve(const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict) { if (high_noise_sample_params.sample_steps <= 0) { high_noise_sample_params.sample_steps = -1; } if (!initialize_cache_params()) { return false; } if (seed < 0) { srand((int)time(nullptr)); seed = rand(); } if (strict) { batch_count = std::clamp(batch_count, 1, 8); sample_params.sample_steps = std::clamp(sample_params.sample_steps, 1, 100); } hires_upscaler_model_path.clear(); if (hires_enabled) { if (hires_upscaler.empty()) { hires_upscaler = "Latent"; } resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str()); if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) { hires_enabled = false; } else if (resolved_hires_upscaler == SD_HIRES_UPSCALER_COUNT) { static const std::vector valid_ext = {".gguf", ".safetensors", ".pt", ".pth"}; if (!resolve_model_file_from_dir(hires_upscaler, hires_upscalers_dir, valid_ext, "hires upscaler", hires_upscaler_model_path)) { return false; } resolved_hires_upscaler = SD_HIRES_UPSCALER_MODEL; } } prompt_with_lora = prompt; if (!lora_model_dir.empty()) { extract_and_remove_lora(lora_model_dir); } return true; } bool SDGenerationParams::validate(SDMode mode) { if (batch_count <= 0) { LOG_ERROR("error: batch_count must be greater than 0"); return false; } if (sample_params.sample_steps <= 0) { LOG_ERROR("error: the sample_steps must be greater than 0\n"); return false; } if (strength < 0.f || strength > 1.f) { LOG_ERROR("error: can only work with strength in [0.0, 1.0]\n"); return false; } if (sample_params.guidance.txt_cfg < 0.f) { LOG_ERROR("error: cfg_scale must be positive"); return false; } if (!cache_mode.empty()) { if (cache_mode == "easycache" || cache_mode == "ucache") { if (cache_params.reuse_threshold < 0.0f) { LOG_ERROR("error: cache threshold must be non-negative"); return false; } if (cache_params.start_percent < 0.0f || cache_params.start_percent >= 1.0f || cache_params.end_percent <= 0.0f || cache_params.end_percent > 1.0f || cache_params.start_percent >= cache_params.end_percent) { LOG_ERROR("error: cache start/end percents must satisfy 0.0 <= start < end <= 1.0"); return false; } } } if (mode == VID_GEN && video_frames <= 0) { return false; } if (mode == VID_GEN && fps <= 0) { return false; } if (sample_params.shifted_timestep < 0 || sample_params.shifted_timestep > 1000) { LOG_ERROR("error: shifted_timestep must be in range [0, 1000]"); return false; } if (upscale_repeats < 1) { return false; } if (upscale_tile_size < 1) { return false; } if (hires_enabled) { if (hires_width < 0 || hires_height < 0) { LOG_ERROR("error: hires target width and height must be >= 0"); return false; } if (hires_scale <= 0.f && hires_width <= 0 && hires_height <= 0) { LOG_ERROR("error: hires scale must be positive when target size is not set"); return false; } if (hires_steps < 0) { LOG_ERROR("error: hires steps must be >= 0"); return false; } if (hires_denoising_strength <= 0.f || hires_denoising_strength > 1.f) { LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]"); return false; } if (hires_upscale_tile_size < 1) { LOG_ERROR("error: hires upscale tile size must be positive"); return false; } } if (mode == UPSCALE) { if (init_image_path.length() == 0) { LOG_ERROR("error: upscale mode needs an init image (--init-img)\n"); return false; } } return true; } bool SDGenerationParams::resolve_and_validate(SDMode mode, const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict) { if (!resolve(lora_model_dir, hires_upscalers_dir, strict)) { return false; } if (!validate(mode)) { return false; } return true; } sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() { sd_img_gen_params_t params; sd_img_gen_params_init(¶ms); lora_vec.clear(); lora_vec.reserve(lora_map.size() + high_noise_lora_map.size()); for (const auto& kv : lora_map) { lora_vec.push_back({false, kv.second, kv.first.c_str()}); } for (const auto& kv : high_noise_lora_map) { lora_vec.push_back({true, kv.second, kv.first.c_str()}); } ref_image_views.clear(); ref_image_views.reserve(ref_images.size()); for (auto& ref_image : ref_images) { ref_image_views.push_back(ref_image.get()); } pm_id_image_views.clear(); pm_id_image_views.reserve(pm_id_images.size()); for (auto& image : pm_id_images) { pm_id_image_views.push_back(image.get()); } sample_params.guidance.slg.layers = skip_layers.empty() ? nullptr : skip_layers.data(); sample_params.guidance.slg.layer_count = skip_layers.size(); high_noise_sample_params.guidance.slg.layers = high_noise_skip_layers.empty() ? nullptr : high_noise_skip_layers.data(); high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size(); sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data(); sample_params.custom_sigmas_count = static_cast(custom_sigmas.size()); cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str(); sd_pm_params_t pm_params = { pm_id_image_views.empty() ? nullptr : pm_id_image_views.data(), static_cast(pm_id_image_views.size()), pm_id_embed_path.empty() ? nullptr : pm_id_embed_path.c_str(), pm_style_strength, }; params.loras = lora_vec.empty() ? nullptr : lora_vec.data(); params.lora_count = static_cast(lora_vec.size()); params.prompt = prompt.c_str(); params.negative_prompt = negative_prompt.c_str(); params.clip_skip = clip_skip; params.init_image = init_image.get(); params.ref_images = ref_image_views.empty() ? nullptr : ref_image_views.data(); params.ref_images_count = static_cast(ref_image_views.size()); params.auto_resize_ref_image = auto_resize_ref_image; params.increase_ref_index = increase_ref_index; params.mask_image = mask_image.get(); params.width = get_resolved_width(); params.height = get_resolved_height(); params.sample_params = sample_params; params.strength = strength; params.seed = seed; params.batch_count = batch_count; params.control_image = control_image.get(); params.control_strength = control_strength; params.pm_params = pm_params; params.vae_tiling_params = vae_tiling_params; params.cache = cache_params; params.hires.enabled = hires_enabled; params.hires.upscaler = resolved_hires_upscaler; params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str(); params.hires.scale = hires_scale; params.hires.target_width = hires_width; params.hires.target_height = hires_height; params.hires.steps = hires_steps; params.hires.denoising_strength = hires_denoising_strength; params.hires.upscale_tile_size = hires_upscale_tile_size; return params; } sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() { sd_vid_gen_params_t params; sd_vid_gen_params_init(¶ms); lora_vec.clear(); lora_vec.reserve(lora_map.size() + high_noise_lora_map.size()); for (const auto& kv : lora_map) { lora_vec.push_back({false, kv.second, kv.first.c_str()}); } for (const auto& kv : high_noise_lora_map) { lora_vec.push_back({true, kv.second, kv.first.c_str()}); } control_frame_views.clear(); control_frame_views.reserve(control_frames.size()); for (auto& frame : control_frames) { control_frame_views.push_back(frame.get()); } sample_params.guidance.slg.layers = skip_layers.empty() ? nullptr : skip_layers.data(); sample_params.guidance.slg.layer_count = skip_layers.size(); high_noise_sample_params.guidance.slg.layers = high_noise_skip_layers.empty() ? nullptr : high_noise_skip_layers.data(); high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size(); sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data(); sample_params.custom_sigmas_count = static_cast(custom_sigmas.size()); cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str(); params.loras = lora_vec.empty() ? nullptr : lora_vec.data(); params.lora_count = static_cast(lora_vec.size()); params.prompt = prompt.c_str(); params.negative_prompt = negative_prompt.c_str(); params.clip_skip = clip_skip; params.init_image = init_image.get(); params.end_image = end_image.get(); params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data(); params.control_frames_size = static_cast(control_frame_views.size()); params.width = get_resolved_width(); params.height = get_resolved_height(); params.sample_params = sample_params; params.high_noise_sample_params = high_noise_sample_params; params.moe_boundary = moe_boundary; params.strength = strength; params.seed = seed; params.video_frames = video_frames; params.fps = fps; params.vace_strength = vace_strength; params.vae_tiling_params = vae_tiling_params; params.cache = cache_params; return params; } std::string SDGenerationParams::to_string() const { FreeUniquePtr sample_params_str(sd_sample_params_to_str(&sample_params)); FreeUniquePtr high_noise_sample_params_str(sd_sample_params_to_str(&high_noise_sample_params)); std::ostringstream lora_ss; lora_ss << "{\n"; for (auto it = lora_map.begin(); it != lora_map.end(); ++it) { lora_ss << " \"" << it->first << "\": \"" << it->second << "\""; if (std::next(it) != lora_map.end()) { lora_ss << ","; } lora_ss << "\n"; } lora_ss << " }"; std::string loras_str = lora_ss.str(); lora_ss = std::ostringstream(); ; lora_ss << "{\n"; for (auto it = high_noise_lora_map.begin(); it != high_noise_lora_map.end(); ++it) { lora_ss << " \"" << it->first << "\": \"" << it->second << "\""; if (std::next(it) != high_noise_lora_map.end()) { lora_ss << ","; } lora_ss << "\n"; } lora_ss << " }"; std::string high_noise_loras_str = lora_ss.str(); std::ostringstream oss; oss << "SDGenerationParams {\n" << " loras: \"" << loras_str << "\",\n" << " high_noise_loras: \"" << high_noise_loras_str << "\",\n" << " prompt: \"" << prompt << "\",\n" << " negative_prompt: \"" << negative_prompt << "\",\n" << " clip_skip: " << clip_skip << ",\n" << " width: " << width << ",\n" << " height: " << height << ",\n" << " batch_count: " << batch_count << ",\n" << " init_image_path: \"" << init_image_path << "\",\n" << " end_image_path: \"" << end_image_path << "\",\n" << " mask_image_path: \"" << mask_image_path << "\",\n" << " control_image_path: \"" << control_image_path << "\",\n" << " ref_image_paths: " << vec_str_to_string(ref_image_paths) << ",\n" << " control_video_path: \"" << control_video_path << "\",\n" << " auto_resize_ref_image: " << (auto_resize_ref_image ? "true" : "false") << ",\n" << " increase_ref_index: " << (increase_ref_index ? "true" : "false") << ",\n" << " pm_id_images_dir: \"" << pm_id_images_dir << "\",\n" << " pm_id_embed_path: \"" << pm_id_embed_path << "\",\n" << " pm_style_strength: " << pm_style_strength << ",\n" << " skip_layers: " << vec_to_string(skip_layers) << ",\n" << " sample_params: " << SAFE_STR(sample_params_str.get()) << ",\n" << " high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n" << " high_noise_sample_params: " << SAFE_STR(high_noise_sample_params_str.get()) << ",\n" << " custom_sigmas: " << vec_to_string(custom_sigmas) << ",\n" << " cache_mode: \"" << cache_mode << "\",\n" << " cache_option: \"" << cache_option << "\",\n" << " cache: " << (cache_params.mode != SD_CACHE_DISABLED ? "enabled" : "disabled") << " (threshold=" << cache_params.reuse_threshold << ", start=" << cache_params.start_percent << ", end=" << cache_params.end_percent << "),\n" << " moe_boundary: " << moe_boundary << ",\n" << " video_frames: " << video_frames << ",\n" << " fps: " << fps << ",\n" << " vace_strength: " << vace_strength << ",\n" << " strength: " << strength << ",\n" << " control_strength: " << control_strength << ",\n" << " seed: " << seed << ",\n" << " upscale_repeats: " << upscale_repeats << ",\n" << " upscale_tile_size: " << upscale_tile_size << ",\n" << " hires: { enabled: " << (hires_enabled ? "true" : "false") << ", upscaler: \"" << hires_upscaler << "\"" << ", model_path: \"" << hires_upscaler_model_path << "\"" << ", scale: " << hires_scale << ", target_width: " << hires_width << ", target_height: " << hires_height << ", steps: " << hires_steps << ", denoising_strength: " << hires_denoising_strength << ", upscale_tile_size: " << hires_upscale_tile_size << " },\n" << " vae_tiling_params: { " << vae_tiling_params.enabled << ", " << vae_tiling_params.temporal_tiling << ", " << vae_tiling_params.tile_size_x << ", " << vae_tiling_params.tile_size_y << ", " << vae_tiling_params.target_overlap << ", " << vae_tiling_params.rel_size_x << ", " << vae_tiling_params.rel_size_y << " },\n" << "}"; return oss.str(); } std::string version_string() { return std::string("stable-diffusion.cpp version ") + sd_version() + ", commit " + sd_commit(); } static std::string safe_json_string(const char* value) { return value ? value : ""; } static void set_json_basename_if_not_empty(json& target, const char* key, const std::string& path) { if (!path.empty()) { target[key] = sd_basename(path); } } static json build_sampling_metadata_json(const sd_sample_params_t& sample_params, const std::vector& skip_layers, const std::vector* custom_sigmas = nullptr) { json sampling = { {"steps", sample_params.sample_steps}, {"eta", sample_params.eta}, {"shifted_timestep", sample_params.shifted_timestep}, {"flow_shift", sample_params.flow_shift}, {"guidance", { {"txt_cfg", sample_params.guidance.txt_cfg}, {"img_cfg", sample_params.guidance.img_cfg}, {"distilled_guidance", sample_params.guidance.distilled_guidance}, {"slg", { {"scale", sample_params.guidance.slg.scale}, {"layers", skip_layers}, {"start", sample_params.guidance.slg.layer_start}, {"end", sample_params.guidance.slg.layer_end}, }}, }}, }; if (sample_params.sample_method != SAMPLE_METHOD_COUNT) { sampling["method"] = safe_json_string(sd_sample_method_name(sample_params.sample_method)); } if (sample_params.scheduler != SCHEDULER_COUNT) { sampling["scheduler"] = safe_json_string(sd_scheduler_name(sample_params.scheduler)); } if (custom_sigmas != nullptr) { sampling["custom_sigmas"] = *custom_sigmas; } return sampling; } std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed, SDMode mode) { json root; root["schema"] = "sdcpp.image.params/v1"; root["mode"] = mode == VID_GEN ? "vid_gen" : "img_gen"; root["generator"] = { {"name", "stable-diffusion.cpp"}, {"version", safe_json_string(sd_version())}, {"commit", safe_json_string(sd_commit())}, }; root["seed"] = seed; root["width"] = gen_params.get_resolved_width(); root["height"] = gen_params.get_resolved_height(); root["prompt"] = { {"positive", gen_params.prompt}, {"negative", gen_params.negative_prompt}, }; root["sampling"] = build_sampling_metadata_json(gen_params.sample_params, gen_params.skip_layers, &gen_params.custom_sigmas); json models; set_json_basename_if_not_empty(models, "model", ctx_params.model_path); set_json_basename_if_not_empty(models, "clip_l", ctx_params.clip_l_path); set_json_basename_if_not_empty(models, "clip_g", ctx_params.clip_g_path); set_json_basename_if_not_empty(models, "clip_vision", ctx_params.clip_vision_path); set_json_basename_if_not_empty(models, "t5xxl", ctx_params.t5xxl_path); set_json_basename_if_not_empty(models, "llm", ctx_params.llm_path); set_json_basename_if_not_empty(models, "llm_vision", ctx_params.llm_vision_path); set_json_basename_if_not_empty(models, "diffusion_model", ctx_params.diffusion_model_path); set_json_basename_if_not_empty(models, "high_noise_diffusion_model", ctx_params.high_noise_diffusion_model_path); set_json_basename_if_not_empty(models, "vae", ctx_params.vae_path); set_json_basename_if_not_empty(models, "taesd", ctx_params.taesd_path); set_json_basename_if_not_empty(models, "control_net", ctx_params.control_net_path); root["models"] = std::move(models); root["clip_skip"] = gen_params.clip_skip; root["strength"] = gen_params.strength; root["control_strength"] = gen_params.control_strength; root["auto_resize_ref_image"] = gen_params.auto_resize_ref_image; root["increase_ref_index"] = gen_params.increase_ref_index; if (mode == VID_GEN) { root["video"] = { {"frame_count", gen_params.video_frames}, {"fps", gen_params.fps}, }; root["moe_boundary"] = gen_params.moe_boundary; root["vace_strength"] = gen_params.vace_strength; root["high_noise_sampling"] = build_sampling_metadata_json(gen_params.high_noise_sample_params, gen_params.high_noise_skip_layers); } root["rng"] = safe_json_string(sd_rng_type_name(ctx_params.rng_type)); if (ctx_params.sampler_rng_type != RNG_TYPE_COUNT) { root["sampler_rng"] = safe_json_string(sd_rng_type_name(ctx_params.sampler_rng_type)); } json loras = json::array(); for (const auto& entry : gen_params.lora_map) { loras.push_back({ {"name", sd_basename(entry.first)}, {"multiplier", entry.second}, {"is_high_noise", false}, }); } for (const auto& entry : gen_params.high_noise_lora_map) { loras.push_back({ {"name", sd_basename(entry.first)}, {"multiplier", entry.second}, {"is_high_noise", true}, }); } if (!loras.empty()) { root["loras"] = std::move(loras); } if (gen_params.hires_enabled) { root["hires"] = { {"enabled", gen_params.hires_enabled}, {"upscaler", gen_params.hires_upscaler}, {"model", gen_params.hires_upscaler_model_path.empty() ? "" : sd_basename(gen_params.hires_upscaler_model_path)}, {"scale", gen_params.hires_scale}, {"target_width", gen_params.hires_width}, {"target_height", gen_params.hires_height}, {"steps", gen_params.hires_steps}, {"denoising_strength", gen_params.hires_denoising_strength}, {"upscale_tile_size", gen_params.hires_upscale_tile_size}, }; } if (gen_params.cache_params.mode != SD_CACHE_DISABLED) { root["cache"] = { {"requested_mode", gen_params.cache_mode}, {"requested_option", gen_params.cache_option}, {"mode", gen_params.cache_params.mode}, {"scm_mask", gen_params.scm_mask}, {"scm_policy_dynamic", gen_params.scm_policy_dynamic}, {"reuse_threshold", gen_params.cache_params.reuse_threshold}, {"start_percent", gen_params.cache_params.start_percent}, {"end_percent", gen_params.cache_params.end_percent}, {"error_decay_rate", gen_params.cache_params.error_decay_rate}, {"use_relative_threshold", gen_params.cache_params.use_relative_threshold}, {"reset_error_on_compute", gen_params.cache_params.reset_error_on_compute}, {"Fn_compute_blocks", gen_params.cache_params.Fn_compute_blocks}, {"Bn_compute_blocks", gen_params.cache_params.Bn_compute_blocks}, {"residual_diff_threshold", gen_params.cache_params.residual_diff_threshold}, {"max_warmup_steps", gen_params.cache_params.max_warmup_steps}, {"max_cached_steps", gen_params.cache_params.max_cached_steps}, {"max_continuous_cached_steps", gen_params.cache_params.max_continuous_cached_steps}, {"taylorseer_n_derivatives", gen_params.cache_params.taylorseer_n_derivatives}, {"taylorseer_skip_interval", gen_params.cache_params.taylorseer_skip_interval}, {"spectrum_w", gen_params.cache_params.spectrum_w}, {"spectrum_m", gen_params.cache_params.spectrum_m}, {"spectrum_lam", gen_params.cache_params.spectrum_lam}, {"spectrum_window_size", gen_params.cache_params.spectrum_window_size}, {"spectrum_flex_window", gen_params.cache_params.spectrum_flex_window}, {"spectrum_warmup_steps", gen_params.cache_params.spectrum_warmup_steps}, {"spectrum_stop_percent", gen_params.cache_params.spectrum_stop_percent}, }; } if (gen_params.vae_tiling_params.enabled) { root["vae_tiling"] = { {"enabled", gen_params.vae_tiling_params.enabled}, {"tile_size_x", gen_params.vae_tiling_params.tile_size_x}, {"tile_size_y", gen_params.vae_tiling_params.tile_size_y}, {"target_overlap", gen_params.vae_tiling_params.target_overlap}, {"rel_size_x", gen_params.vae_tiling_params.rel_size_x}, {"rel_size_y", gen_params.vae_tiling_params.rel_size_y}, }; } return root.dump(); } std::string get_image_params(const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed, SDMode mode) { std::string parameter_string; if (gen_params.prompt_with_lora.size() != 0) { parameter_string += gen_params.prompt_with_lora + "\n"; } else { parameter_string += gen_params.prompt + "\n"; } if (gen_params.negative_prompt.size() != 0) { parameter_string += "Negative prompt: " + gen_params.negative_prompt + "\n"; } parameter_string += "Steps: " + std::to_string(gen_params.sample_params.sample_steps) + ", "; parameter_string += "CFG scale: " + std::to_string(gen_params.sample_params.guidance.txt_cfg) + ", "; if (gen_params.sample_params.guidance.slg.scale != 0 && gen_params.skip_layers.size() != 0) { parameter_string += "SLG scale: " + std::to_string(gen_params.sample_params.guidance.slg.scale) + ", "; parameter_string += "Skip layers: ["; for (const auto& layer : gen_params.skip_layers) { parameter_string += std::to_string(layer) + ", "; } parameter_string += "], "; parameter_string += "Skip layer start: " + std::to_string(gen_params.sample_params.guidance.slg.layer_start) + ", "; parameter_string += "Skip layer end: " + std::to_string(gen_params.sample_params.guidance.slg.layer_end) + ", "; } parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", "; parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", "; parameter_string += "Seed: " + std::to_string(seed) + ", "; parameter_string += "Size: " + std::to_string(gen_params.get_resolved_width()) + "x" + std::to_string(gen_params.get_resolved_height()) + ", "; parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", "; parameter_string += "RNG: " + std::string(sd_rng_type_name(ctx_params.rng_type)) + ", "; if (ctx_params.sampler_rng_type != RNG_TYPE_COUNT) { parameter_string += "Sampler RNG: " + std::string(sd_rng_type_name(ctx_params.sampler_rng_type)) + ", "; } parameter_string += "Sampler: " + std::string(sd_sample_method_name(gen_params.sample_params.sample_method)); if (!gen_params.custom_sigmas.empty()) { parameter_string += ", Custom Sigmas: ["; for (size_t i = 0; i < gen_params.custom_sigmas.size(); ++i) { std::ostringstream oss; oss << std::fixed << std::setprecision(4) << gen_params.custom_sigmas[i]; parameter_string += oss.str() + (i == gen_params.custom_sigmas.size() - 1 ? "" : ", "); } parameter_string += "]"; } else if (gen_params.sample_params.scheduler != SCHEDULER_COUNT) { // Only show schedule if not using custom sigmas parameter_string += " " + std::string(sd_scheduler_name(gen_params.sample_params.scheduler)); } parameter_string += ", "; for (const auto& te : {ctx_params.clip_l_path, ctx_params.clip_g_path, ctx_params.t5xxl_path, ctx_params.llm_path, ctx_params.llm_vision_path}) { if (!te.empty()) { parameter_string += "TE: " + sd_basename(te) + ", "; } } if (!ctx_params.diffusion_model_path.empty()) { parameter_string += "Unet: " + sd_basename(ctx_params.diffusion_model_path) + ", "; } if (!ctx_params.vae_path.empty()) { parameter_string += "VAE: " + sd_basename(ctx_params.vae_path) + ", "; } if (gen_params.clip_skip != -1) { parameter_string += "Clip skip: " + std::to_string(gen_params.clip_skip) + ", "; } if (gen_params.hires_enabled) { parameter_string += "Hires upscale: " + gen_params.hires_upscaler + ", "; parameter_string += "Hires scale: " + std::to_string(gen_params.hires_scale) + ", "; parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", "; parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", "; parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", "; } parameter_string += "Version: stable-diffusion.cpp"; parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode); return parameter_string; }