fix: resolve precision issues in SDXL VAE under fp16 (#888)

* fix: resolve precision issues in SDXL VAE under fp16
* add --force-sdxl-vae-conv-scale option
* update docs
feat: added prediction argument (#334)
2026-06-25 15:46:40 +00:00 · 2025-10-15 23:01:00 +08:00 · 2025-10-15 23:00:10 +08:00 · 2025-10-15 22:10:26 +08:00
9 changed files with 228 additions and 101 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -33,6 +33,7 @@ option(SD_SYCL                       "sd: sycl backend" OFF)
 option(SD_MUSA                       "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)
+option(SD_BUILD_SHARED_GGML_LIB      "sd: build ggml as a separate shared lib" OFF)
 option(SD_USE_SYSTEM_GGML            "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)

@ -86,18 +87,21 @@ file(GLOB SD_LIB_SOURCES
    "*.hpp"
 )

-# we can get only one share lib
 if(SD_BUILD_SHARED_LIBS)
    message("-- Build shared library")
    message(${SD_LIB_SOURCES})
-    set(BUILD_SHARED_LIBS OFF)
+    if(NOT SD_BUILD_SHARED_GGML_LIB)
+        set(BUILD_SHARED_LIBS OFF)
+    endif()
    add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
    add_definitions(-DSD_BUILD_SHARED_LIB)
    target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 else()
    message("-- Build static library")
-    set(BUILD_SHARED_LIBS OFF)
+    if(NOT SD_BUILD_SHARED_GGML_LIB)
+        set(BUILD_SHARED_LIBS OFF)
+    endif()
    add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
 endif()

--- a/README.md
+++ b/README.md
@ -17,7 +17,6 @@ API and command-line option may change frequently.***
  - Image Models
    - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
    - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
-      - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
    - [SD3/SD3.5](./docs/sd3.md)
    - [Flux-dev/Flux-schnell](./docs/flux.md)
    - [Chroma](./docs/chroma.md)
@ -358,12 +357,14 @@ arguments:
  --rng {std_default, cuda}          RNG (default: cuda)
  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
  -b, --batch-count COUNT            number of images to generate
+  --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override
  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
  --vae-tiling                       process vae in tiles to reduce memory usage
  --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)
  --vae-relative-tile-size [X]x[Y]   relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
  --vae-tile-overlap OVERLAP         tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --force-sdxl-vae-conv-scale        force use of conv scale on sdxl vae
  --vae-on-cpu                       keep vae in cpu (for low vram)
  --clip-on-cpu                      keep clip in cpu (for low vram)
  --diffusion-fa                     use flash attention in the diffusion model (for low vram)
--- a/conditioner.hpp
+++ b/conditioner.hpp
@ -1457,7 +1457,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
                                      const ConditionerParams& conditioner_params) {
        std::string prompt;
        std::vector<std::pair<int, ggml_tensor*>> image_embeds;
-        size_t system_prompt_length = 0;
+        size_t system_prompt_length          = 0;
        int prompt_template_encode_start_idx = 34;
        if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
            LOG_INFO("QwenImageEditPlusPipeline");
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -84,6 +84,7 @@ struct SDParams {

    std::string prompt;
    std::string negative_prompt;
+
    int clip_skip   = -1;  // <= 0 represents unspecified
    int width       = 512;
    int height      = 512;
@ -127,7 +128,10 @@ struct SDParams {
    int chroma_t5_mask_pad   = 1;
    float flow_shift         = INFINITY;

+    prediction_t prediction = DEFAULT_PRED;
+
    sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
+    bool force_sdxl_vae_conv_scale       = false;

    SDParams() {
        sd_sample_params_init(&sample_params);
@ -188,12 +192,14 @@ void print_params(SDParams params) {
    printf("    sample_params:                     %s\n", SAFE_STR(sample_params_str));
    printf("    high_noise_sample_params:          %s\n", SAFE_STR(high_noise_sample_params_str));
    printf("    moe_boundary:                      %.3f\n", params.moe_boundary);
+    printf("    prediction:                        %s\n", sd_prediction_name(params.prediction));
    printf("    flow_shift:                        %.2f\n", params.flow_shift);
    printf("    strength(img2img):                 %.2f\n", params.strength);
    printf("    rng:                               %s\n", sd_rng_type_name(params.rng_type));
    printf("    seed:                              %zd\n", params.seed);
    printf("    batch_count:                       %d\n", params.batch_count);
    printf("    vae_tiling:                        %s\n", params.vae_tiling_params.enabled ? "true" : "false");
+    printf("    force_sdxl_vae_conv_scale:         %s\n", params.force_sdxl_vae_conv_scale ? "true" : "false");
    printf("    upscale_repeats:                   %d\n", params.upscale_repeats);
    printf("    chroma_use_dit_mask:               %s\n", params.chroma_use_dit_mask ? "true" : "false");
    printf("    chroma_use_t5_mask:                %s\n", params.chroma_use_t5_mask ? "true" : "false");
@ -281,12 +287,14 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
    printf("  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)\n");
    printf("  -b, --batch-count COUNT            number of images to generate\n");
+    printf("  --prediction {eps, v, edm_v, sd3_flow, flux_flow}        Prediction type override.\n");
    printf("  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
    printf("                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
    printf("  --vae-tiling                       process vae in tiles to reduce memory usage\n");
    printf("  --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)\n");
    printf("  --vae-relative-tile-size [X]x[Y]   relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n");
    printf("  --vae-tile-overlap OVERLAP         tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n");
+    printf("  --force-sdxl-vae-conv-scale        force use of conv scale on sdxl vae\n");
    printf("  --vae-on-cpu                       keep vae in cpu (for low vram)\n");
    printf("  --clip-on-cpu                      keep clip in cpu (for low vram)\n");
    printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
@ -557,6 +565,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {

    options.bool_options = {
        {"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled},
+        {"", "--force-sdxl-vae-conv-scale", "", true, &params.force_sdxl_vae_conv_scale},
        {"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
        {"", "--control-net-cpu", "", true, &params.control_net_cpu},
        {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
@ -651,6 +660,20 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        return 1;
    };

+    auto on_prediction_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* arg   = argv[index];
+        params.prediction = str_to_prediction(arg);
+        if (params.prediction == PREDICTION_COUNT) {
+            fprintf(stderr, "error: invalid prediction type %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
    auto on_sample_method_arg = [&](int argc, const char** argv, int index) {
        if (++index >= argc) {
            return -1;
@ -807,6 +830,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        {"", "--rng", "", on_rng_arg},
        {"-s", "--seed", "", on_seed_arg},
        {"", "--sampling-method", "", on_sample_method_arg},
+        {"", "--prediction", "", on_prediction_arg},
        {"", "--scheduler", "", on_schedule_arg},
        {"", "--skip-layers", "", on_skip_layers_arg},
        {"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg},
@ -1354,6 +1378,7 @@ int main(int argc, const char* argv[]) {
        params.n_threads,
        params.wtype,
        params.rng_type,
+        params.prediction,
        params.offload_params_to_cpu,
        params.clip_on_cpu,
        params.control_net_cpu,
@ -1361,6 +1386,7 @@ int main(int argc, const char* argv[]) {
        params.diffusion_flash_attn,
        params.diffusion_conv_direct,
        params.vae_conv_direct,
+        params.force_sdxl_vae_conv_scale,
        params.chroma_use_dit_mask,
        params.chroma_use_t5_mask,
        params.chroma_t5_mask_pad,
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -975,38 +975,28 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
                                                      struct ggml_tensor* x,
                                                      struct ggml_tensor* w,
                                                      struct ggml_tensor* b,
-                                                      int s0 = 1,
-                                                      int s1 = 1,
-                                                      int p0 = 0,
-                                                      int p1 = 0,
-                                                      int d0 = 1,
-                                                      int d1 = 1) {
-    x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
-    if (b != NULL) {
-        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
-        // b = ggml_repeat(ctx, b, x);
-        x = ggml_add_inplace(ctx, x, b);
+                                                      int s0      = 1,
+                                                      int s1      = 1,
+                                                      int p0      = 0,
+                                                      int p1      = 0,
+                                                      int d0      = 1,
+                                                      int d1      = 1,
+                                                      bool direct = false,
+                                                      float scale = 1.f) {
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, scale);
+    }
+    if (direct) {
+        x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    } else {
+        x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    }
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, 1.f / scale);
    }
-    return x;
-}
-
-// w: [OC*IC, KD, KH, KW]
-// x: [N*IC, ID, IH, IW]
-__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
-                                                             struct ggml_tensor* x,
-                                                             struct ggml_tensor* w,
-                                                             struct ggml_tensor* b,
-                                                             int s0 = 1,
-                                                             int s1 = 1,
-                                                             int p0 = 0,
-                                                             int p1 = 0,
-                                                             int d0 = 1,
-                                                             int d1 = 1) {
-    x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
    if (b != NULL) {
        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
-        // b = ggml_repeat(ctx, b, x);
-        x = ggml_add(ctx, x, b);
+        x = ggml_add_inplace(ctx, x, b);
    }
    return x;
 }
@ -2067,6 +2057,7 @@ protected:
    std::pair<int, int> dilation;
    bool bias;
    bool direct = false;
+    float scale = 1.f;

    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
        enum ggml_type wtype = GGML_TYPE_F16;
@ -2097,6 +2088,10 @@ public:
        direct = true;
    }

+    void set_scale(float scale_value) {
+        scale = scale_value;
+    }
+
    std::string get_desc() {
        return "Conv2d";
    }
@ -2107,11 +2102,18 @@ public:
        if (bias) {
            b = params["bias"];
        }
-        if (direct) {
-            return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-        } else {
-            return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-        }
+        return ggml_nn_conv_2d(ctx,
+                               x,
+                               w,
+                               b,
+                               stride.second,
+                               stride.first,
+                               padding.second,
+                               padding.first,
+                               dilation.second,
+                               dilation.first,
+                               direct,
+                               scale);
    }
 };

--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@ -535,7 +535,7 @@ namespace Qwen {
                }
            }
            LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
-            qwen_image                   = QwenImageModel(qwen_image_params);
+            qwen_image = QwenImageModel(qwen_image_params);
            qwen_image.init(params_ctx, tensor_types, prefix);
        }

--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -330,13 +330,6 @@ public:

        if (sd_version_is_sdxl(version)) {
            scale_factor = 0.13025f;
-            if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && strlen(SAFE_STR(sd_ctx_params->taesd_path)) == 0) {
-                LOG_WARN(
-                    "!!!It looks like you are using SDXL model. "
-                    "If you find that the generated images are completely black, "
-                    "try specifying SDXL VAE FP16 Fix with the --vae parameter. "
-                    "You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
-            }
        } else if (sd_version_is_sd3(version)) {
            scale_factor = 1.5305f;
        } else if (sd_version_is_flux(version)) {
@ -517,6 +510,15 @@ public:
                    LOG_INFO("Using Conv2d direct in the vae model");
                    first_stage_model->enable_conv2d_direct();
                }
+                if (version == VERSION_SDXL &&
+                    (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale)) {
+                    float vae_conv_2d_scale = 1.f / 32.f;
+                    LOG_WARN(
+                        "No VAE specified with --vae or --force-sdxl-vae-conv-scale flag set, "
+                        "using Conv2D scale %.3f",
+                        vae_conv_2d_scale);
+                    first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
+                }
                first_stage_model->alloc_params_buffer();
                first_stage_model->get_param_tensors(tensors, "first_stage_model");
            } else {
@ -700,64 +702,102 @@ public:
                ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
        }

-        // check is_using_v_parameterization_for_sd2
-        if (sd_version_is_sd2(version)) {
-            if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
-                is_using_v_parameterization = true;
-            }
-        } else if (sd_version_is_sdxl(version)) {
-            if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
-                // CosXL models
-                // TODO: get sigma_min and sigma_max values from file
-                is_using_edm_v_parameterization = true;
-            }
-            if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
-                is_using_v_parameterization = true;
-            }
-        } else if (version == VERSION_SVD) {
-            // TODO: V_PREDICTION_EDM
-            is_using_v_parameterization = true;
-        }
-
-        if (sd_version_is_sd3(version)) {
-            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
-            if (shift == INFINITY) {
-                shift = 3.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
-        } else if (sd_version_is_flux(version)) {
-            LOG_INFO("running in Flux FLOW mode");
-            float shift = 1.0f;  // TODO: validate
-            for (auto pair : model_loader.tensor_storages_types) {
-                if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
-                    shift = 1.15f;
+        if (sd_ctx_params->prediction != DEFAULT_PRED) {
+            switch (sd_ctx_params->prediction) {
+                case EPS_PRED:
+                    LOG_INFO("running in eps-prediction mode");
+                    break;
+                case V_PRED:
+                    LOG_INFO("running in v-prediction mode");
+                    denoiser = std::make_shared<CompVisVDenoiser>();
+                    break;
+                case EDM_V_PRED:
+                    LOG_INFO("running in v-prediction EDM mode");
+                    denoiser = std::make_shared<EDMVDenoiser>();
+                    break;
+                case SD3_FLOW_PRED: {
+                    LOG_INFO("running in FLOW mode");
+                    float shift = sd_ctx_params->flow_shift;
+                    if (shift == INFINITY) {
+                        shift = 3.0;
+                    }
+                    denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
                    break;
                }
+                case FLUX_FLOW_PRED: {
+                    LOG_INFO("running in Flux FLOW mode");
+                    float shift = sd_ctx_params->flow_shift;
+                    if (shift == INFINITY) {
+                        shift = 3.0;
+                    }
+                    denoiser = std::make_shared<FluxFlowDenoiser>(shift);
+                    break;
+                }
+                default: {
+                    LOG_ERROR("Unknown parametrization %i", sd_ctx_params->prediction);
+                    return false;
+                }
            }
-            denoiser = std::make_shared<FluxFlowDenoiser>(shift);
-        } else if (sd_version_is_wan(version)) {
-            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
-            if (shift == INFINITY) {
-                shift = 5.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
-        } else if (sd_version_is_qwen_image(version)) {
-            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
-            if (shift == INFINITY) {
-                shift = 3.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
-        } else if (is_using_v_parameterization) {
-            LOG_INFO("running in v-prediction mode");
-            denoiser = std::make_shared<CompVisVDenoiser>();
-        } else if (is_using_edm_v_parameterization) {
-            LOG_INFO("running in v-prediction EDM mode");
-            denoiser = std::make_shared<EDMVDenoiser>();
        } else {
-            LOG_INFO("running in eps-prediction mode");
+            if (sd_version_is_sd2(version)) {
+                // check is_using_v_parameterization_for_sd2
+                if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
+                    is_using_v_parameterization = true;
+                }
+            } else if (sd_version_is_sdxl(version)) {
+                if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
+                    // CosXL models
+                    // TODO: get sigma_min and sigma_max values from file
+                    is_using_edm_v_parameterization = true;
+                }
+                if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
+                    is_using_v_parameterization = true;
+                }
+            } else if (version == VERSION_SVD) {
+                // TODO: V_PREDICTION_EDM
+                is_using_v_parameterization = true;
+            }
+
+            if (sd_version_is_sd3(version)) {
+                LOG_INFO("running in FLOW mode");
+                float shift = sd_ctx_params->flow_shift;
+                if (shift == INFINITY) {
+                    shift = 3.0;
+                }
+                denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+            } else if (sd_version_is_flux(version)) {
+                LOG_INFO("running in Flux FLOW mode");
+                float shift = 1.0f;  // TODO: validate
+                for (auto pair : model_loader.tensor_storages_types) {
+                    if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
+                        shift = 1.15f;
+                        break;
+                    }
+                }
+                denoiser = std::make_shared<FluxFlowDenoiser>(shift);
+            } else if (sd_version_is_wan(version)) {
+                LOG_INFO("running in FLOW mode");
+                float shift = sd_ctx_params->flow_shift;
+                if (shift == INFINITY) {
+                    shift = 5.0;
+                }
+                denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+            } else if (sd_version_is_qwen_image(version)) {
+                LOG_INFO("running in FLOW mode");
+                float shift = sd_ctx_params->flow_shift;
+                if (shift == INFINITY) {
+                    shift = 3.0;
+                }
+                denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+            } else if (is_using_v_parameterization) {
+                LOG_INFO("running in v-prediction mode");
+                denoiser = std::make_shared<CompVisVDenoiser>();
+            } else if (is_using_edm_v_parameterization) {
+                LOG_INFO("running in v-prediction EDM mode");
+                denoiser = std::make_shared<EDMVDenoiser>();
+            } else {
+                LOG_INFO("running in eps-prediction mode");
+            }
        }

        auto comp_vis_denoiser = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser);
@ -1742,6 +1782,31 @@ enum scheduler_t str_to_schedule(const char* str) {
    return SCHEDULE_COUNT;
 }

+const char* prediction_to_str[] = {
+    "default",
+    "eps",
+    "v",
+    "edm_v",
+    "sd3_flow",
+    "flux_flow",
+};
+
+const char* sd_prediction_name(enum prediction_t prediction) {
+    if (prediction < PREDICTION_COUNT) {
+        return prediction_to_str[prediction];
+    }
+    return NONE_STR;
+}
+
+enum prediction_t str_to_prediction(const char* str) {
+    for (int i = 0; i < PREDICTION_COUNT; i++) {
+        if (!strcmp(str, prediction_to_str[i])) {
+            return (enum prediction_t)i;
+        }
+    }
+    return PREDICTION_COUNT;
+}
+
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    *sd_ctx_params                         = {};
    sd_ctx_params->vae_decode_only         = true;
@ -1749,6 +1814,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    sd_ctx_params->n_threads               = get_num_physical_cores();
    sd_ctx_params->wtype                   = SD_TYPE_COUNT;
    sd_ctx_params->rng_type                = CUDA_RNG;
+    sd_ctx_params->prediction              = DEFAULT_PRED;
    sd_ctx_params->offload_params_to_cpu   = false;
    sd_ctx_params->keep_clip_on_cpu        = false;
    sd_ctx_params->keep_control_net_on_cpu = false;
@ -1788,6 +1854,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
             "n_threads: %d\n"
             "wtype: %s\n"
             "rng_type: %s\n"
+             "prediction: %s\n"
             "offload_params_to_cpu: %s\n"
             "keep_clip_on_cpu: %s\n"
             "keep_control_net_on_cpu: %s\n"
@ -1816,6 +1883,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
             sd_ctx_params->n_threads,
             sd_type_name(sd_ctx_params->wtype),
             sd_rng_type_name(sd_ctx_params->rng_type),
+             sd_prediction_name(sd_ctx_params->prediction),
             BOOL_STR(sd_ctx_params->offload_params_to_cpu),
             BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
             BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -64,6 +64,16 @@ enum scheduler_t {
    SCHEDULE_COUNT
 };

+enum prediction_t {
+    DEFAULT_PRED,
+    EPS_PRED,
+    V_PRED,
+    EDM_V_PRED,
+    SD3_FLOW_PRED,
+    FLUX_FLOW_PRED,
+    PREDICTION_COUNT
+};
+
 // same as enum ggml_type
 enum sd_type_t {
    SD_TYPE_F32  = 0,
@ -146,6 +156,7 @@ typedef struct {
    int n_threads;
    enum sd_type_t wtype;
    enum rng_type_t rng_type;
+    enum prediction_t prediction;
    bool offload_params_to_cpu;
    bool keep_clip_on_cpu;
    bool keep_control_net_on_cpu;
@ -153,6 +164,7 @@ typedef struct {
    bool diffusion_flash_attn;
    bool diffusion_conv_direct;
    bool vae_conv_direct;
+    bool force_sdxl_vae_conv_scale;
    bool chroma_use_dit_mask;
    bool chroma_use_t5_mask;
    int chroma_t5_mask_pad;
@ -255,6 +267,8 @@ SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
 SD_API enum sample_method_t str_to_sample_method(const char* str);
 SD_API const char* sd_schedule_name(enum scheduler_t scheduler);
 SD_API enum scheduler_t str_to_schedule(const char* str);
+SD_API const char* sd_prediction_name(enum prediction_t prediction);
+SD_API enum prediction_t str_to_prediction(const char* str);

 SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
 SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
--- a/vae.hpp
+++ b/vae.hpp
@ -530,6 +530,7 @@ struct VAE : public GGMLRunner {
                         struct ggml_context* output_ctx)                                                         = 0;
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
    virtual void enable_conv2d_direct(){};
+    virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
 };

 struct AutoEncoderKL : public VAE {
@ -558,6 +559,17 @@ struct AutoEncoderKL : public VAE {
        }
    }

+    void set_conv2d_scale(float scale) {
+        std::vector<GGMLBlock*> blocks;
+        ae.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->set_scale(scale);
+            }
+        }
+    }
+
    std::string get_desc() {
        return "vae";
    }
Author	SHA1	Message	Date
leejet	40a6a8710e	fix: resolve precision issues in SDXL VAE under fp16 (#888) * fix: resolve precision issues in SDXL VAE under fp16 * add --force-sdxl-vae-conv-scale option * update docs	2025-10-15 23:01:00 +08:00
Daniele	e3702585cb	feat: added prediction argument (#334)	2025-10-15 23:00:10 +08:00
cmdr2	a7d6d296c7	chore: allow building ggml as a separate shared lib (#468)	2025-10-15 22:10:26 +08:00