Compare commits

...

3 Commits

Author SHA1 Message Date
leejet
40a6a8710e
fix: resolve precision issues in SDXL VAE under fp16 (#888)
* fix: resolve precision issues in SDXL VAE under fp16

* add --force-sdxl-vae-conv-scale option

* update docs
2025-10-15 23:01:00 +08:00
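Why the fix works: the SDXL VAE produces activation magnitudes large enough that ggml's fp16 convolution can overflow (the largest finite fp16 value is 65504), which shows up as NaN outputs and black images. The change scales activations down by 1/32 before the convolution and scales the result back up afterwards. A toy, self-contained C++ sketch of that arithmetic (all magnitudes below are made up for illustration; the real change is the ggml_nn_conv_2d() rework further down in this diff):

// Toy illustration (hypothetical magnitudes): why pre-scaling keeps an fp16
// conv accumulator finite. The actual fix is in ggml_nn_conv_2d() below.
#include <cstdio>

int main() {
    const float fp16_max = 65504.0f;      // largest finite half-precision value
    const float scale    = 1.0f / 32.0f;  // same factor the SDXL VAE path uses

    // Pretend one output pixel of a 3x3 conv over 512 channels accumulates
    // products averaging ~18 in magnitude (made-up numbers).
    const int taps = 3 * 3 * 512;
    float     acc  = taps * 18.0f;
    printf("raw accumulator:        %8.1f  overflows fp16: %s\n", acc, acc > fp16_max ? "yes" : "no");

    // Convolution is linear, so scaling the input scales the accumulator too.
    float scaled = acc * scale;
    printf("pre-scaled accumulator: %8.1f  overflows fp16: %s\n", scaled, scaled > fp16_max ? "yes" : "no");

    // Undoing the scale on the output recovers the intended value.
    printf("rescaled output:        %8.1f\n", scaled / scale);
    return 0;
}

Because convolution without bias is linear, pre-scaling the input scales every intermediate product and the accumulator by the same factor, so the rescaled output matches the unscaled result up to rounding (the bias is added after the unscale in the new code).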
Daniele
e3702585cb
feat: added prediction argument (#334) 2025-10-15 23:00:10 +08:00
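For library callers, the new prediction override is plumbed through sd_ctx_params_t. A minimal sketch using only the API surface added in this change (model paths and context creation are omitted):

// Sketch: force a prediction type instead of relying on auto-detection.
#include "stable-diffusion.h"
#include <cstdio>

int main() {
    sd_ctx_params_t params;
    sd_ctx_params_init(&params);                 // defaults to DEFAULT_PRED (auto-detect)

    params.prediction = str_to_prediction("v");  // e.g. force v-prediction for a v-pred checkpoint
    if (params.prediction == PREDICTION_COUNT) {
        fprintf(stderr, "unknown prediction type\n");
        return 1;
    }
    printf("prediction override: %s\n", sd_prediction_name(params.prediction));
    // ... fill in model paths and pass &params to new_sd_ctx() as usual ...
    return 0;
}

str_to_prediction() returns PREDICTION_COUNT for unknown strings, which is the same check the CLI's new --prediction handler performs.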
cmdr2
a7d6d296c7
chore: allow building ggml as a separate shared lib (#468) 2025-10-15 22:10:26 +08:00
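With SD_BUILD_SHARED_GGML_LIB, ggml is no longer forced to build statically into the stable-diffusion target. A hypothetical downstream CMake project opting in could look like this (the subdirectory path and the stable-diffusion/my_sd_app target names are assumptions; the library target corresponds to ${SD_LIB} in the CMakeLists.txt hunk below):

# Hypothetical consumer CMakeLists.txt: stable-diffusion as a shared lib,
# with ggml built as its own shared library (paths/target names assumed).
cmake_minimum_required(VERSION 3.13)
project(my_sd_app)

set(SD_BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
set(SD_BUILD_SHARED_GGML_LIB ON CACHE BOOL "" FORCE)
add_subdirectory(stable-diffusion.cpp)

add_executable(my_sd_app main.cpp)
target_link_libraries(my_sd_app PRIVATE stable-diffusion)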
9 changed files with 228 additions and 101 deletions

View File

@@ -33,6 +33,7 @@ option(SD_SYCL "sd: sycl backend" OFF)
 option(SD_MUSA "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
+option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
 option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER "sd: build server example" ON)
@@ -86,18 +87,21 @@ file(GLOB SD_LIB_SOURCES
     "*.hpp"
 )
 
-# we can get only one share lib
 if(SD_BUILD_SHARED_LIBS)
     message("-- Build shared library")
     message(${SD_LIB_SOURCES})
-    set(BUILD_SHARED_LIBS OFF)
+    if(NOT SD_BUILD_SHARED_GGML_LIB)
+        set(BUILD_SHARED_LIBS OFF)
+    endif()
     add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
     add_definitions(-DSD_BUILD_SHARED_LIB)
     target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
     set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 else()
     message("-- Build static library")
-    set(BUILD_SHARED_LIBS OFF)
+    if(NOT SD_BUILD_SHARED_GGML_LIB)
+        set(BUILD_SHARED_LIBS OFF)
+    endif()
     add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
 endif()

View File

@@ -17,7 +17,6 @@ API and command-line option may change frequently.***
 - Image Models
     - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
     - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
-        - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
     - [SD3/SD3.5](./docs/sd3.md)
     - [Flux-dev/Flux-schnell](./docs/flux.md)
     - [Chroma](./docs/chroma.md)
@@ -358,12 +357,14 @@ arguments:
   --rng {std_default, cuda}          RNG (default: cuda)
   -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
   -b, --batch-count COUNT            number of images to generate
+  --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override
   --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
                                      <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
   --vae-tiling                       process vae in tiles to reduce memory usage
   --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)
   --vae-relative-tile-size [X]x[Y]   relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
   --vae-tile-overlap OVERLAP         tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --force-sdxl-vae-conv-scale        force use of conv scale on sdxl vae
   --vae-on-cpu                       keep vae in cpu (for low vram)
   --clip-on-cpu                      keep clip in cpu (for low vram)
   --diffusion-fa                     use flash attention in the diffusion model (for low vram)
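As illustration, the two new options from this compare might be combined with the existing flags like this (model and VAE filenames are placeholders, not part of this change):

./build/bin/sd -m sd_xl_base_1.0.safetensors --vae sdxl_vae.safetensors --force-sdxl-vae-conv-scale -p "a lovely cat"
./build/bin/sd -m v2-1_768-ema-pruned.safetensors --prediction v -p "a lovely cat"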

View File

@@ -1457,7 +1457,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
                           const ConditionerParams& conditioner_params) {
         std::string prompt;
         std::vector<std::pair<int, ggml_tensor*>> image_embeds;
         size_t system_prompt_length = 0;
         int prompt_template_encode_start_idx = 34;
         if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
             LOG_INFO("QwenImageEditPlusPipeline");

View File

@@ -84,6 +84,7 @@ struct SDParams {
     std::string prompt;
     std::string negative_prompt;
     int clip_skip = -1;  // <= 0 represents unspecified
     int width = 512;
     int height = 512;
@@ -127,7 +128,10 @@ struct SDParams {
     int chroma_t5_mask_pad = 1;
     float flow_shift = INFINITY;
 
+    prediction_t prediction = DEFAULT_PRED;
+
     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
+    bool force_sdxl_vae_conv_scale = false;
 
     SDParams() {
         sd_sample_params_init(&sample_params);
@@ -188,12 +192,14 @@ void print_params(SDParams params) {
     printf("    sample_params: %s\n", SAFE_STR(sample_params_str));
     printf("    high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str));
     printf("    moe_boundary: %.3f\n", params.moe_boundary);
+    printf("    prediction: %s\n", sd_prediction_name(params.prediction));
     printf("    flow_shift: %.2f\n", params.flow_shift);
     printf("    strength(img2img): %.2f\n", params.strength);
     printf("    rng: %s\n", sd_rng_type_name(params.rng_type));
     printf("    seed: %zd\n", params.seed);
     printf("    batch_count: %d\n", params.batch_count);
     printf("    vae_tiling: %s\n", params.vae_tiling_params.enabled ? "true" : "false");
+    printf("    force_sdxl_vae_conv_scale: %s\n", params.force_sdxl_vae_conv_scale ? "true" : "false");
     printf("    upscale_repeats: %d\n", params.upscale_repeats);
     printf("    chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false");
     printf("    chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false");
@@ -281,12 +287,14 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
     printf("  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)\n");
     printf("  -b, --batch-count COUNT            number of images to generate\n");
+    printf("  --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override.\n");
     printf("  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
     printf("                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
     printf("  --vae-tiling                       process vae in tiles to reduce memory usage\n");
     printf("  --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)\n");
     printf("  --vae-relative-tile-size [X]x[Y]   relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n");
     printf("  --vae-tile-overlap OVERLAP         tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n");
+    printf("  --force-sdxl-vae-conv-scale        force use of conv scale on sdxl vae\n");
     printf("  --vae-on-cpu                       keep vae in cpu (for low vram)\n");
     printf("  --clip-on-cpu                      keep clip in cpu (for low vram)\n");
     printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
@@ -557,6 +565,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
     options.bool_options = {
         {"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled},
+        {"", "--force-sdxl-vae-conv-scale", "", true, &params.force_sdxl_vae_conv_scale},
         {"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
         {"", "--control-net-cpu", "", true, &params.control_net_cpu},
         {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
@@ -651,6 +660,20 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         return 1;
     };
 
+    auto on_prediction_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* arg = argv[index];
+        params.prediction = str_to_prediction(arg);
+        if (params.prediction == PREDICTION_COUNT) {
+            fprintf(stderr, "error: invalid prediction type %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
     auto on_sample_method_arg = [&](int argc, const char** argv, int index) {
         if (++index >= argc) {
             return -1;
@@ -807,6 +830,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--rng", "", on_rng_arg},
         {"-s", "--seed", "", on_seed_arg},
         {"", "--sampling-method", "", on_sample_method_arg},
+        {"", "--prediction", "", on_prediction_arg},
         {"", "--scheduler", "", on_schedule_arg},
         {"", "--skip-layers", "", on_skip_layers_arg},
         {"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg},
@@ -1354,6 +1378,7 @@ int main(int argc, const char* argv[]) {
         params.n_threads,
         params.wtype,
         params.rng_type,
+        params.prediction,
         params.offload_params_to_cpu,
         params.clip_on_cpu,
         params.control_net_cpu,
@@ -1361,6 +1386,7 @@ int main(int argc, const char* argv[]) {
         params.diffusion_flash_attn,
         params.diffusion_conv_direct,
         params.vae_conv_direct,
+        params.force_sdxl_vae_conv_scale,
         params.chroma_use_dit_mask,
         params.chroma_use_t5_mask,
         params.chroma_t5_mask_pad,

View File

@@ -975,38 +975,28 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
                                                       struct ggml_tensor* x,
                                                       struct ggml_tensor* w,
                                                       struct ggml_tensor* b,
                                                       int s0 = 1,
                                                       int s1 = 1,
                                                       int p0 = 0,
                                                       int p1 = 0,
                                                       int d0 = 1,
-                                                      int d1 = 1) {
-    x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
-    if (b != NULL) {
-        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
-        // b = ggml_repeat(ctx, b, x);
-        x = ggml_add_inplace(ctx, x, b);
+                                                      int d1 = 1,
+                                                      bool direct = false,
+                                                      float scale = 1.f) {
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, scale);
+    }
+    if (direct) {
+        x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    } else {
+        x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    }
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, 1.f / scale);
     }
-    return x;
-}
-
-// w: [OC*IC, KD, KH, KW]
-// x: [N*IC, ID, IH, IW]
-__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
-                                                             struct ggml_tensor* x,
-                                                             struct ggml_tensor* w,
-                                                             struct ggml_tensor* b,
-                                                             int s0 = 1,
-                                                             int s1 = 1,
-                                                             int p0 = 0,
-                                                             int p1 = 0,
-                                                             int d0 = 1,
-                                                             int d1 = 1) {
-    x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
     if (b != NULL) {
         b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
-        // b = ggml_repeat(ctx, b, x);
-        x = ggml_add(ctx, x, b);
+        x = ggml_add_inplace(ctx, x, b);
     }
     return x;
 }
@@ -2067,6 +2057,7 @@ protected:
     std::pair<int, int> dilation;
     bool bias;
     bool direct = false;
+    float scale = 1.f;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
         enum ggml_type wtype = GGML_TYPE_F16;
@@ -2097,6 +2088,10 @@ public:
         direct = true;
     }
 
+    void set_scale(float scale_value) {
+        scale = scale_value;
+    }
+
     std::string get_desc() {
         return "Conv2d";
     }
@@ -2107,11 +2102,18 @@ public:
         if (bias) {
            b = params["bias"];
         }
-        if (direct) {
-            return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-        } else {
-            return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-        }
+        return ggml_nn_conv_2d(ctx,
+                               x,
+                               w,
+                               b,
+                               stride.second,
+                               stride.first,
+                               padding.second,
+                               padding.first,
+                               dilation.second,
+                               dilation.first,
+                               direct,
+                               scale);
     }
 };

View File

@@ -535,7 +535,7 @@ namespace Qwen {
             }
         }
         LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
         qwen_image = QwenImageModel(qwen_image_params);
         qwen_image.init(params_ctx, tensor_types, prefix);
     }

View File

@@ -330,13 +330,6 @@ public:
         if (sd_version_is_sdxl(version)) {
             scale_factor = 0.13025f;
-            if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && strlen(SAFE_STR(sd_ctx_params->taesd_path)) == 0) {
-                LOG_WARN(
-                    "!!!It looks like you are using SDXL model. "
-                    "If you find that the generated images are completely black, "
-                    "try specifying SDXL VAE FP16 Fix with the --vae parameter. "
-                    "You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
-            }
         } else if (sd_version_is_sd3(version)) {
             scale_factor = 1.5305f;
         } else if (sd_version_is_flux(version)) {
@@ -517,6 +510,15 @@ public:
             LOG_INFO("Using Conv2d direct in the vae model");
             first_stage_model->enable_conv2d_direct();
         }
+        if (version == VERSION_SDXL &&
+            (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale)) {
+            float vae_conv_2d_scale = 1.f / 32.f;
+            LOG_WARN(
+                "No VAE specified with --vae or --force-sdxl-vae-conv-scale flag set, "
+                "using Conv2D scale %.3f",
+                vae_conv_2d_scale);
+            first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
+        }
         first_stage_model->alloc_params_buffer();
         first_stage_model->get_param_tensors(tensors, "first_stage_model");
     } else {
@@ -700,64 +702,102 @@ public:
                      ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
         }
 
-        // check is_using_v_parameterization_for_sd2
-        if (sd_version_is_sd2(version)) {
-            if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
-                is_using_v_parameterization = true;
-            }
-        } else if (sd_version_is_sdxl(version)) {
-            if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
-                // CosXL models
-                // TODO: get sigma_min and sigma_max values from file
-                is_using_edm_v_parameterization = true;
-            }
-            if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
-                is_using_v_parameterization = true;
-            }
-        } else if (version == VERSION_SVD) {
-            // TODO: V_PREDICTION_EDM
-            is_using_v_parameterization = true;
-        }
-
-        if (sd_version_is_sd3(version)) {
-            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
-            if (shift == INFINITY) {
-                shift = 3.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
-        } else if (sd_version_is_flux(version)) {
-            LOG_INFO("running in Flux FLOW mode");
-            float shift = 1.0f;  // TODO: validate
-            for (auto pair : model_loader.tensor_storages_types) {
-                if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
-                    shift = 1.15f;
-                    break;
-                }
-            }
-            denoiser = std::make_shared<FluxFlowDenoiser>(shift);
-        } else if (sd_version_is_wan(version)) {
-            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
-            if (shift == INFINITY) {
-                shift = 5.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
-        } else if (sd_version_is_qwen_image(version)) {
-            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
-            if (shift == INFINITY) {
-                shift = 3.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
-        } else if (is_using_v_parameterization) {
-            LOG_INFO("running in v-prediction mode");
-            denoiser = std::make_shared<CompVisVDenoiser>();
-        } else if (is_using_edm_v_parameterization) {
-            LOG_INFO("running in v-prediction EDM mode");
-            denoiser = std::make_shared<EDMVDenoiser>();
-        } else {
-            LOG_INFO("running in eps-prediction mode");
+        if (sd_ctx_params->prediction != DEFAULT_PRED) {
+            switch (sd_ctx_params->prediction) {
+                case EPS_PRED:
+                    LOG_INFO("running in eps-prediction mode");
+                    break;
+                case V_PRED:
+                    LOG_INFO("running in v-prediction mode");
+                    denoiser = std::make_shared<CompVisVDenoiser>();
+                    break;
+                case EDM_V_PRED:
+                    LOG_INFO("running in v-prediction EDM mode");
+                    denoiser = std::make_shared<EDMVDenoiser>();
+                    break;
+                case SD3_FLOW_PRED: {
+                    LOG_INFO("running in FLOW mode");
+                    float shift = sd_ctx_params->flow_shift;
+                    if (shift == INFINITY) {
+                        shift = 3.0;
+                    }
+                    denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+                    break;
+                }
+                case FLUX_FLOW_PRED: {
+                    LOG_INFO("running in Flux FLOW mode");
+                    float shift = sd_ctx_params->flow_shift;
+                    if (shift == INFINITY) {
+                        shift = 3.0;
+                    }
+                    denoiser = std::make_shared<FluxFlowDenoiser>(shift);
+                    break;
+                }
+                default: {
+                    LOG_ERROR("Unknown parametrization %i", sd_ctx_params->prediction);
+                    return false;
+                }
+            }
+        } else {
+            if (sd_version_is_sd2(version)) {
+                // check is_using_v_parameterization_for_sd2
+                if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
+                    is_using_v_parameterization = true;
+                }
+            } else if (sd_version_is_sdxl(version)) {
+                if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
+                    // CosXL models
+                    // TODO: get sigma_min and sigma_max values from file
+                    is_using_edm_v_parameterization = true;
+                }
+                if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
+                    is_using_v_parameterization = true;
+                }
+            } else if (version == VERSION_SVD) {
+                // TODO: V_PREDICTION_EDM
+                is_using_v_parameterization = true;
+            }
+
+            if (sd_version_is_sd3(version)) {
+                LOG_INFO("running in FLOW mode");
+                float shift = sd_ctx_params->flow_shift;
+                if (shift == INFINITY) {
+                    shift = 3.0;
+                }
+                denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+            } else if (sd_version_is_flux(version)) {
+                LOG_INFO("running in Flux FLOW mode");
+                float shift = 1.0f;  // TODO: validate
+                for (auto pair : model_loader.tensor_storages_types) {
+                    if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
+                        shift = 1.15f;
+                        break;
+                    }
+                }
+                denoiser = std::make_shared<FluxFlowDenoiser>(shift);
+            } else if (sd_version_is_wan(version)) {
+                LOG_INFO("running in FLOW mode");
+                float shift = sd_ctx_params->flow_shift;
+                if (shift == INFINITY) {
+                    shift = 5.0;
+                }
+                denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+            } else if (sd_version_is_qwen_image(version)) {
+                LOG_INFO("running in FLOW mode");
+                float shift = sd_ctx_params->flow_shift;
+                if (shift == INFINITY) {
+                    shift = 3.0;
+                }
+                denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+            } else if (is_using_v_parameterization) {
+                LOG_INFO("running in v-prediction mode");
+                denoiser = std::make_shared<CompVisVDenoiser>();
+            } else if (is_using_edm_v_parameterization) {
+                LOG_INFO("running in v-prediction EDM mode");
+                denoiser = std::make_shared<EDMVDenoiser>();
+            } else {
+                LOG_INFO("running in eps-prediction mode");
+            }
         }
 
         auto comp_vis_denoiser = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser);
@@ -1742,6 +1782,31 @@ enum scheduler_t str_to_schedule(const char* str) {
     return SCHEDULE_COUNT;
 }
 
+const char* prediction_to_str[] = {
+    "default",
+    "eps",
+    "v",
+    "edm_v",
+    "sd3_flow",
+    "flux_flow",
+};
+
+const char* sd_prediction_name(enum prediction_t prediction) {
+    if (prediction < PREDICTION_COUNT) {
+        return prediction_to_str[prediction];
+    }
+    return NONE_STR;
+}
+
+enum prediction_t str_to_prediction(const char* str) {
+    for (int i = 0; i < PREDICTION_COUNT; i++) {
+        if (!strcmp(str, prediction_to_str[i])) {
+            return (enum prediction_t)i;
+        }
+    }
+    return PREDICTION_COUNT;
+}
+
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     *sd_ctx_params = {};
     sd_ctx_params->vae_decode_only = true;
@@ -1749,6 +1814,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->n_threads = get_num_physical_cores();
     sd_ctx_params->wtype = SD_TYPE_COUNT;
     sd_ctx_params->rng_type = CUDA_RNG;
+    sd_ctx_params->prediction = DEFAULT_PRED;
     sd_ctx_params->offload_params_to_cpu = false;
     sd_ctx_params->keep_clip_on_cpu = false;
     sd_ctx_params->keep_control_net_on_cpu = false;
@@ -1788,6 +1854,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
         "n_threads: %d\n"
         "wtype: %s\n"
         "rng_type: %s\n"
+        "prediction: %s\n"
         "offload_params_to_cpu: %s\n"
         "keep_clip_on_cpu: %s\n"
         "keep_control_net_on_cpu: %s\n"
@@ -1816,6 +1883,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
         sd_ctx_params->n_threads,
         sd_type_name(sd_ctx_params->wtype),
         sd_rng_type_name(sd_ctx_params->rng_type),
+        sd_prediction_name(sd_ctx_params->prediction),
         BOOL_STR(sd_ctx_params->offload_params_to_cpu),
         BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
         BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),

View File

@@ -64,6 +64,16 @@ enum scheduler_t {
     SCHEDULE_COUNT
 };
 
+enum prediction_t {
+    DEFAULT_PRED,
+    EPS_PRED,
+    V_PRED,
+    EDM_V_PRED,
+    SD3_FLOW_PRED,
+    FLUX_FLOW_PRED,
+    PREDICTION_COUNT
+};
+
 // same as enum ggml_type
 enum sd_type_t {
     SD_TYPE_F32 = 0,
@@ -146,6 +156,7 @@ typedef struct {
     int n_threads;
     enum sd_type_t wtype;
    enum rng_type_t rng_type;
+    enum prediction_t prediction;
     bool offload_params_to_cpu;
     bool keep_clip_on_cpu;
     bool keep_control_net_on_cpu;
@@ -153,6 +164,7 @@ typedef struct {
     bool diffusion_flash_attn;
     bool diffusion_conv_direct;
     bool vae_conv_direct;
+    bool force_sdxl_vae_conv_scale;
     bool chroma_use_dit_mask;
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;
@@ -255,6 +267,8 @@ SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
 SD_API enum sample_method_t str_to_sample_method(const char* str);
 SD_API const char* sd_schedule_name(enum scheduler_t scheduler);
 SD_API enum scheduler_t str_to_schedule(const char* str);
+SD_API const char* sd_prediction_name(enum prediction_t prediction);
+SD_API enum prediction_t str_to_prediction(const char* str);
 SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
 SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);

vae.hpp
View File

@@ -530,6 +530,7 @@ struct VAE : public GGMLRunner {
                         struct ggml_context* output_ctx) = 0;
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
     virtual void enable_conv2d_direct(){};
+    virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
 };
 
 struct AutoEncoderKL : public VAE {
@@ -558,6 +559,17 @@ struct AutoEncoderKL : public VAE {
         }
     }
 
+    void set_conv2d_scale(float scale) {
+        std::vector<GGMLBlock*> blocks;
+        ae.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->set_scale(scale);
+            }
+        }
+    }
+
     std::string get_desc() {
         return "vae";
     }