diff --git a/README.md b/README.md index 0a27bc1..40ac8f3 100644 --- a/README.md +++ b/README.md @@ -358,6 +358,7 @@ arguments: --rng {std_default, cuda} RNG (default: cuda) -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0) -b, --batch-count COUNT number of images to generate + --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1) <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x --vae-tiling process vae in tiles to reduce memory usage diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index b1d83a0..41c17fe 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -84,6 +84,7 @@ struct SDParams { std::string prompt; std::string negative_prompt; + int clip_skip = -1; // <= 0 represents unspecified int width = 512; int height = 512; @@ -127,6 +128,8 @@ struct SDParams { int chroma_t5_mask_pad = 1; float flow_shift = INFINITY; + prediction_t prediction = DEFAULT_PRED; + sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f}; SDParams() { @@ -188,6 +191,7 @@ void print_params(SDParams params) { printf(" sample_params: %s\n", SAFE_STR(sample_params_str)); printf(" high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str)); printf(" moe_boundary: %.3f\n", params.moe_boundary); + printf(" prediction: %s\n", sd_prediction_name(params.prediction)); printf(" flow_shift: %.2f\n", params.flow_shift); printf(" strength(img2img): %.2f\n", params.strength); printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); @@ -281,6 +285,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); printf(" -b, --batch-count COUNT number of images to generate\n"); + printf(" --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override.\n"); 
printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); printf(" --vae-tiling process vae in tiles to reduce memory usage\n"); @@ -651,6 +656,20 @@ void parse_args(int argc, const char** argv, SDParams& params) { return 1; }; + auto on_prediction_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.prediction = str_to_prediction(arg); + if (params.prediction == PREDICTION_COUNT) { + fprintf(stderr, "error: invalid prediction type %s\n", + arg); + return -1; + } + return 1; + }; + auto on_sample_method_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; @@ -807,6 +826,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--rng", "", on_rng_arg}, {"-s", "--seed", "", on_seed_arg}, {"", "--sampling-method", "", on_sample_method_arg}, + {"", "--prediction", "", on_prediction_arg}, {"", "--scheduler", "", on_schedule_arg}, {"", "--skip-layers", "", on_skip_layers_arg}, {"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg}, @@ -1354,6 +1374,7 @@ int main(int argc, const char* argv[]) { params.n_threads, params.wtype, params.rng_type, + params.prediction, params.offload_params_to_cpu, params.clip_on_cpu, params.control_net_cpu, diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4291280..35ff2c6 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -700,64 +700,102 @@ public: ggml_backend_is_cpu(clip_backend) ? 
"RAM" : "VRAM"); } - // check is_using_v_parameterization_for_sd2 - if (sd_version_is_sd2(version)) { - if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) { - is_using_v_parameterization = true; - } - } else if (sd_version_is_sdxl(version)) { - if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) { - // CosXL models - // TODO: get sigma_min and sigma_max values from file - is_using_edm_v_parameterization = true; - } - if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) { - is_using_v_parameterization = true; - } - } else if (version == VERSION_SVD) { - // TODO: V_PREDICTION_EDM - is_using_v_parameterization = true; - } - - if (sd_version_is_sd3(version)) { - LOG_INFO("running in FLOW mode"); - float shift = sd_ctx_params->flow_shift; - if (shift == INFINITY) { - shift = 3.0; - } - denoiser = std::make_shared(shift); - } else if (sd_version_is_flux(version)) { - LOG_INFO("running in Flux FLOW mode"); - float shift = 1.0f; // TODO: validate - for (auto pair : model_loader.tensor_storages_types) { - if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) { - shift = 1.15f; + if (sd_ctx_params->prediction != DEFAULT_PRED) { + switch (sd_ctx_params->prediction) { + case EPS_PRED: + LOG_INFO("running in eps-prediction mode"); + break; + case V_PRED: + LOG_INFO("running in v-prediction mode"); + denoiser = std::make_shared(); + break; + case EDM_V_PRED: + LOG_INFO("running in v-prediction EDM mode"); + denoiser = std::make_shared(); + break; + case SD3_FLOW_PRED: { + LOG_INFO("running in FLOW mode"); + float shift = sd_ctx_params->flow_shift; + if (shift == INFINITY) { + shift = 3.0; + } + denoiser = std::make_shared(shift); break; } + case FLUX_FLOW_PRED: { + LOG_INFO("running in Flux FLOW mode"); + float shift = sd_ctx_params->flow_shift; + if (shift == INFINITY) { + shift = 3.0; + } + denoiser = 
std::make_shared(shift); + break; + } + default: { + LOG_ERROR("Unknown parametrization %i", sd_ctx_params->prediction); + return false; + } } - denoiser = std::make_shared(shift); - } else if (sd_version_is_wan(version)) { - LOG_INFO("running in FLOW mode"); - float shift = sd_ctx_params->flow_shift; - if (shift == INFINITY) { - shift = 5.0; - } - denoiser = std::make_shared(shift); - } else if (sd_version_is_qwen_image(version)) { - LOG_INFO("running in FLOW mode"); - float shift = sd_ctx_params->flow_shift; - if (shift == INFINITY) { - shift = 3.0; - } - denoiser = std::make_shared(shift); - } else if (is_using_v_parameterization) { - LOG_INFO("running in v-prediction mode"); - denoiser = std::make_shared(); - } else if (is_using_edm_v_parameterization) { - LOG_INFO("running in v-prediction EDM mode"); - denoiser = std::make_shared(); } else { - LOG_INFO("running in eps-prediction mode"); + if (sd_version_is_sd2(version)) { + // check is_using_v_parameterization_for_sd2 + if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) { + is_using_v_parameterization = true; + } + } else if (sd_version_is_sdxl(version)) { + if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) { + // CosXL models + // TODO: get sigma_min and sigma_max values from file + is_using_edm_v_parameterization = true; + } + if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) { + is_using_v_parameterization = true; + } + } else if (version == VERSION_SVD) { + // TODO: V_PREDICTION_EDM + is_using_v_parameterization = true; + } + + if (sd_version_is_sd3(version)) { + LOG_INFO("running in FLOW mode"); + float shift = sd_ctx_params->flow_shift; + if (shift == INFINITY) { + shift = 3.0; + } + denoiser = std::make_shared(shift); + } else if (sd_version_is_flux(version)) { + LOG_INFO("running in Flux FLOW mode"); + float shift = 1.0f; // TODO: validate + for (auto pair : 
model_loader.tensor_storages_types) { + if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) { + shift = 1.15f; + break; + } + } + denoiser = std::make_shared(shift); + } else if (sd_version_is_wan(version)) { + LOG_INFO("running in FLOW mode"); + float shift = sd_ctx_params->flow_shift; + if (shift == INFINITY) { + shift = 5.0; + } + denoiser = std::make_shared(shift); + } else if (sd_version_is_qwen_image(version)) { + LOG_INFO("running in FLOW mode"); + float shift = sd_ctx_params->flow_shift; + if (shift == INFINITY) { + shift = 3.0; + } + denoiser = std::make_shared(shift); + } else if (is_using_v_parameterization) { + LOG_INFO("running in v-prediction mode"); + denoiser = std::make_shared(); + } else if (is_using_edm_v_parameterization) { + LOG_INFO("running in v-prediction EDM mode"); + denoiser = std::make_shared(); + } else { + LOG_INFO("running in eps-prediction mode"); + } } auto comp_vis_denoiser = std::dynamic_pointer_cast(denoiser); @@ -1742,6 +1780,31 @@ enum scheduler_t str_to_schedule(const char* str) { return SCHEDULE_COUNT; } +const char* prediction_to_str[] = { + "default", + "eps", + "v", + "edm_v", + "sd3_flow", + "flux_flow", +}; + +const char* sd_prediction_name(enum prediction_t prediction) { + if (prediction < PREDICTION_COUNT) { + return prediction_to_str[prediction]; + } + return NONE_STR; +} + +enum prediction_t str_to_prediction(const char* str) { + for (int i = 0; i < PREDICTION_COUNT; i++) { + if (!strcmp(str, prediction_to_str[i])) { + return (enum prediction_t)i; + } + } + return PREDICTION_COUNT; +} + void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { *sd_ctx_params = {}; sd_ctx_params->vae_decode_only = true; @@ -1749,6 +1812,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->n_threads = get_num_physical_cores(); sd_ctx_params->wtype = SD_TYPE_COUNT; sd_ctx_params->rng_type = CUDA_RNG; + sd_ctx_params->prediction = DEFAULT_PRED; 
// Prediction type that can be requested through sd_ctx_params_t::prediction.
// The numeric values are used as indices into the name table behind
// sd_prediction_name() / str_to_prediction(), so the order matters.
enum prediction_t {
    DEFAULT_PRED     = 0,  // no override requested
    EPS_PRED         = 1,  // named "eps"
    V_PRED           = 2,  // named "v"
    EDM_V_PRED       = 3,  // named "edm_v"
    SD3_FLOW_PRED    = 4,  // named "sd3_flow"
    FLUX_FLOW_PRED   = 5,  // named "flux_flow"
    PREDICTION_COUNT = 6   // number of prediction types; also returned by str_to_prediction() for unknown names
};