feat: add Instruct-Pix2Pix/CosXL-Edit support (#679)

* Instruct-p2p support

* support 2 conditionings cfg

* Do not re-encode the exact same image twice

* fixes for 2-cfg

* Fix pix2pix latent inputs + improve inpainting a bit + fix naming

* prepare for other pix2pix-like models

* Support sdxl ip2p

* fix reference image embeddings

* Support 2-cond cfg properly in cli

* fix typo in help

* Support masks for ip2p models

* unify code style

* delete unused code

* use edit mode

* add img_cond

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
This commit is contained in:
stduhpf 2025-07-12 09:36:45 +02:00 committed by GitHub
parent 6d84a30c66
commit a772dca27a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 242 additions and 197 deletions

View File

@ -97,15 +97,16 @@ struct SDParams {
std::string prompt;
std::string negative_prompt;
float min_cfg = 1.0f;
float cfg_scale = 7.0f;
float guidance = 3.5f;
float eta = 0.f;
float style_ratio = 20.f;
int clip_skip = -1; // <= 0 represents unspecified
int width = 512;
int height = 512;
int batch_count = 1;
float min_cfg = 1.0f;
float cfg_scale = 7.0f;
float img_cfg_scale = INFINITY;
float guidance = 3.5f;
float eta = 0.f;
float style_ratio = 20.f;
int clip_skip = -1; // <= 0 represents unspecified
int width = 512;
int height = 512;
int batch_count = 1;
int video_frames = 6;
int motion_bucket_id = 127;
@ -176,6 +177,7 @@ void print_params(SDParams params) {
printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
printf(" min_cfg: %.2f\n", params.min_cfg);
printf(" cfg_scale: %.2f\n", params.cfg_scale);
printf(" img_cfg_scale: %.2f\n", params.img_cfg_scale);
printf(" slg_scale: %.2f\n", params.slg_scale);
printf(" guidance: %.2f\n", params.guidance);
printf(" eta: %.2f\n", params.eta);
@ -234,7 +236,8 @@ void print_usage(int argc, const char* argv[]) {
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
printf(" --guidance SCALE guidance scale for img2img (default: 3.5)\n");
printf(" --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
printf(" --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)\n");
printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n");
@ -470,6 +473,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.cfg_scale = std::stof(argv[i]);
} else if (arg == "--img-cfg-scale") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.img_cfg_scale = std::stof(argv[i]);
} else if (arg == "--guidance") {
if (++i >= argc) {
invalid_arg = true;
@ -755,6 +764,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
params.output_path = "output.gguf";
}
}
if (!isfinite(params.img_cfg_scale)) {
params.img_cfg_scale = params.cfg_scale;
}
}
static std::string sd_basename(const std::string& path) {
@ -849,6 +862,18 @@ int main(int argc, const char* argv[]) {
parse_args(argc, argv, params);
sd_guidance_params_t guidance_params = {params.cfg_scale,
params.img_cfg_scale,
params.min_cfg,
params.guidance,
{
params.skip_layers.data(),
params.skip_layers.size(),
params.skip_layer_start,
params.skip_layer_end,
params.slg_scale,
}};
sd_set_log_callback(sd_log_cb, (void*)&params);
if (params.verbose) {
@ -1041,8 +1066,7 @@ int main(int argc, const char* argv[]) {
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.guidance,
guidance_params,
params.eta,
params.width,
params.height,
@ -1054,12 +1078,7 @@ int main(int argc, const char* argv[]) {
params.control_strength,
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str(),
params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
params.input_id_images_path.c_str());
} else if (params.mode == IMG2IMG || params.mode == IMG2VID) {
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,
@ -1075,8 +1094,7 @@ int main(int argc, const char* argv[]) {
params.motion_bucket_id,
params.fps,
params.augmentation_level,
params.min_cfg,
params.cfg_scale,
guidance_params,
params.sample_method,
params.sample_steps,
params.strength,
@ -1109,8 +1127,7 @@ int main(int argc, const char* argv[]) {
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.guidance,
guidance_params,
params.eta,
params.width,
params.height,
@ -1123,12 +1140,7 @@ int main(int argc, const char* argv[]) {
params.control_strength,
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str(),
params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
params.input_id_images_path.c_str());
}
} else { // EDIT
results = edit(sd_ctx,
@ -1137,25 +1149,19 @@ int main(int argc, const char* argv[]) {
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.guidance,
guidance_params,
params.eta,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.strength,
params.seed,
params.batch_count,
control_image,
params.control_strength,
params.style_ratio,
params.normalize_input,
params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
params.input_id_images_path.c_str());
}
if (results == NULL) {

View File

@ -1673,10 +1673,14 @@ SDVersion ModelLoader::get_sd_version() {
}
}
bool is_inpaint = input_block_weight.ne[2] == 9;
bool is_ip2p = input_block_weight.ne[2] == 8;
if (is_xl) {
if (is_inpaint) {
return VERSION_SDXL_INPAINT;
}
if (is_ip2p) {
return VERSION_SDXL_PIX2PIX;
}
return VERSION_SDXL;
}
@ -1692,6 +1696,9 @@ SDVersion ModelLoader::get_sd_version() {
if (is_inpaint) {
return VERSION_SD1_INPAINT;
}
if (is_ip2p) {
return VERSION_SD1_PIX2PIX;
}
return VERSION_SD1;
} else if (token_embedding_weight.ne[0] == 1024) {
if (is_inpaint) {

14
model.h
View File

@ -21,10 +21,12 @@
enum SDVersion {
VERSION_SD1,
VERSION_SD1_INPAINT,
VERSION_SD1_PIX2PIX,
VERSION_SD2,
VERSION_SD2_INPAINT,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
VERSION_SVD,
VERSION_SD3,
VERSION_FLUX,
@ -47,7 +49,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
}
static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
return true;
}
return false;
@ -61,7 +63,7 @@ static inline bool sd_version_is_sd2(SDVersion version) {
}
static inline bool sd_version_is_sdxl(SDVersion version) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) {
return true;
}
return false;
@ -81,6 +83,14 @@ static inline bool sd_version_is_dit(SDVersion version) {
return false;
}
// True for Instruct-Pix2Pix-style UNet edit models, i.e. the variants whose
// input block takes 8 concatenated channels (see get_sd_version / unet.hpp).
static inline bool sd_version_is_unet_edit(SDVersion version) {
    switch (version) {
        case VERSION_SD1_PIX2PIX:
        case VERSION_SDXL_PIX2PIX:
            return true;
        default:
            return false;
    }
}
// True for models whose UNet consumes an extra concatenated image conditioning
// input: inpainting models and Instruct-Pix2Pix-style edit models.
// Marked `inline` like the sibling sd_version_is_* helpers above: a plain
// `static` function in a header gets a separate copy (and an
// unused-function warning) in every translation unit that includes it.
static inline bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {
    return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version);
}
enum PMVersion {
PM_VERSION_1,
PM_VERSION_2,

View File

@ -27,10 +27,12 @@
const char* model_version_to_str[] = {
"SD 1.x",
"SD 1.x Inpaint",
"Instruct-Pix2Pix",
"SD 2.x",
"SD 2.x Inpaint",
"SDXL",
"SDXL Inpaint",
"SDXL Instruct-Pix2Pix",
"SVD",
"SD3.x",
"Flux",
@ -824,22 +826,30 @@ public:
ggml_tensor* noise,
SDCondition cond,
SDCondition uncond,
SDCondition img_cond,
ggml_tensor* control_hint,
float control_strength,
float min_cfg,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
sample_method_t method,
const std::vector<float>& sigmas,
int start_merge_step,
SDCondition id_cond,
std::vector<ggml_tensor*> ref_latents = {},
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
ggml_tensor* noise_mask = nullptr) {
ggml_tensor* denoise_mask = nullptr) {
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
float cfg_scale = guidance.txt_cfg;
float img_cfg_scale = guidance.img_cfg;
float slg_scale = guidance.slg.scale;
float min_cfg = guidance.min_cfg;
if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) {
LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance...");
img_cfg_scale = cfg_scale;
}
LOG_DEBUG("Sample");
struct ggml_init_params params;
size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@ -861,13 +871,15 @@ public:
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);
bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL;
bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != NULL;
bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0;
// denoise wrapper
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* out_uncond = NULL;
struct ggml_tensor* out_skip = NULL;
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* out_uncond = NULL;
struct ggml_tensor* out_skip = NULL;
struct ggml_tensor* out_img_cond = NULL;
if (has_unconditioned) {
out_uncond = ggml_dup_tensor(work_ctx, x);
@ -880,6 +892,9 @@ public:
LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
}
}
if (has_img_cond) {
out_img_cond = ggml_dup_tensor(work_ctx, x);
}
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@ -897,7 +912,7 @@ public:
float t = denoiser->sigma_to_t(sigma);
std::vector<float> timesteps_vec(x->ne[3], t); // [N, ]
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
std::vector<float> guidance_vec(x->ne[3], guidance);
std::vector<float> guidance_vec(x->ne[3], guidance.distilled_guidance);
auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec);
copy_ggml_tensor(noised_input, input);
@ -964,8 +979,25 @@ public:
negative_data = (float*)out_uncond->data;
}
float* img_cond_data = NULL;
if (has_img_cond) {
diffusion_model->compute(n_threads,
noised_input,
timesteps,
img_cond.c_crossattn,
img_cond.c_concat,
img_cond.c_vector,
guidance_tensor,
ref_latents,
-1,
controls,
control_strength,
&out_img_cond);
img_cond_data = (float*)out_img_cond->data;
}
int step_count = sigmas.size();
bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count);
bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count);
float* skip_layer_data = NULL;
if (is_skiplayer_step) {
LOG_DEBUG("Skipping layers at step %d\n", step);
@ -999,8 +1031,17 @@ public:
int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2];
float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
} else {
latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
if (has_img_cond) {
// out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
} else {
// img_cfg_scale == cfg_scale
latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
}
}
} else if (has_img_cond) {
// img_cfg_scale == 1
latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]);
}
if (is_skiplayer_step) {
latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
@ -1014,10 +1055,10 @@ public:
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
// LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
}
if (noise_mask != nullptr) {
if (denoise_mask != nullptr) {
for (int64_t x = 0; x < denoised->ne[0]; x++) {
for (int64_t y = 0; y < denoised->ne[1]; y++) {
float mask = ggml_tensor_get_f32(noise_mask, x, y);
float mask = ggml_tensor_get_f32(denoise_mask, x, y);
for (int64_t k = 0; k < denoised->ne[2]; k++) {
float init = ggml_tensor_get_f32(init_latent, x, y, k);
float den = ggml_tensor_get_f32(denoised, x, y, k);
@ -1240,8 +1281,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
std::string prompt,
std::string negative_prompt,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@ -1255,11 +1295,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
bool normalize_input,
std::string input_id_images_path,
std::vector<ggml_tensor*> ref_latents,
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
ggml_tensor* masked_image = NULL) {
ggml_tensor* concat_latent = NULL,
ggml_tensor* denoise_mask = NULL) {
if (seed < 0) {
// Generally, when using the provided command line, the seed is always >0.
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@ -1407,7 +1444,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
sd_ctx->sd->diffusion_model->get_adm_in_channels());
SDCondition uncond;
if (cfg_scale != 1.0) {
if (guidance.txt_cfg != 1.0 ||
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
bool force_zero_embeddings = false;
if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) {
force_zero_embeddings = true;
@ -1446,38 +1484,50 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
int W = width / 8;
int H = height / 8;
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
ggml_tensor* noise_mask = nullptr;
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
if (masked_image == NULL) {
int64_t mask_channels = 1;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
mask_channels = 8 * 8; // flatten the whole mask
}
// no mask, set the whole image as masked
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
for (int64_t x = 0; x < masked_image->ne[0]; x++) {
for (int64_t y = 0; y < masked_image->ne[1]; y++) {
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
// TODO: this might be wrong
for (int64_t c = 0; c < init_latent->ne[2]; c++) {
ggml_tensor_set_f32(masked_image, 0, x, y, c);
}
for (int64_t c = init_latent->ne[2]; c < masked_image->ne[2]; c++) {
ggml_tensor_set_f32(masked_image, 1, x, y, c);
}
} else {
ggml_tensor_set_f32(masked_image, 1, x, y, 0);
for (int64_t c = 1; c < masked_image->ne[2]; c++) {
ggml_tensor_set_f32(masked_image, 0, x, y, c);
}
int64_t mask_channels = 1;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
mask_channels = 8 * 8; // flatten the whole mask
}
auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
// no mask, set the whole image as masked
for (int64_t x = 0; x < empty_latent->ne[0]; x++) {
for (int64_t y = 0; y < empty_latent->ne[1]; y++) {
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
// TODO: this might be wrong
for (int64_t c = 0; c < init_latent->ne[2]; c++) {
ggml_tensor_set_f32(empty_latent, 0, x, y, c);
}
for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) {
ggml_tensor_set_f32(empty_latent, 1, x, y, c);
}
} else {
ggml_tensor_set_f32(empty_latent, 1, x, y, 0);
for (int64_t c = 1; c < empty_latent->ne[2]; c++) {
ggml_tensor_set_f32(empty_latent, 0, x, y, c);
}
}
}
}
cond.c_concat = masked_image;
uncond.c_concat = masked_image;
} else {
noise_mask = masked_image;
if (concat_latent == NULL) {
concat_latent = empty_latent;
}
cond.c_concat = concat_latent;
uncond.c_concat = empty_latent;
denoise_mask = NULL;
} else if (sd_version_is_unet_edit(sd_ctx->sd->version)) {
auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
ggml_set_f32(empty_latent, 0);
uncond.c_concat = empty_latent;
if (concat_latent == NULL) {
concat_latent = empty_latent;
}
cond.c_concat = ref_latents[0];
}
SDCondition img_cond;
if (uncond.c_crossattn != NULL &&
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat);
}
for (int b = 0; b < batch_count; b++) {
int64_t sampling_start = ggml_time_ms();
@ -1497,15 +1547,17 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
}
// Disable min_cfg
guidance.min_cfg = guidance.txt_cfg;
struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
x_t,
noise,
cond,
uncond,
img_cond,
image_hint,
control_strength,
cfg_scale,
cfg_scale,
guidance,
eta,
sample_method,
@ -1513,11 +1565,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
start_merge_step,
id_cond,
ref_latents,
skip_layers,
slg_scale,
skip_layer_start,
skip_layer_end,
noise_mask);
denoise_mask);
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
@ -1595,8 +1643,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt_c_str,
const char* negative_prompt_c_str,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@ -1608,13 +1655,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_ratio,
bool normalize_input,
const char* input_id_images_path_c_str,
int* skip_layers = NULL,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
const char* input_id_images_path_c_str) {
LOG_DEBUG("txt2img %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
@ -1659,7 +1700,6 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
prompt_c_str,
negative_prompt_c_str,
clip_skip,
cfg_scale,
guidance,
eta,
width,
@ -1673,11 +1713,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
style_ratio,
normalize_input,
input_id_images_path_c_str,
{},
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end);
{});
size_t t1 = ggml_time_ms();
@ -1692,8 +1728,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
const char* prompt_c_str,
const char* negative_prompt_c_str,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@ -1706,13 +1741,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_ratio,
bool normalize_input,
const char* input_id_images_path_c_str,
int* skip_layers = NULL,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
const char* input_id_images_path_c_str) {
LOG_DEBUG("img2img %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
@ -1756,7 +1785,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_to_tensor(init_image.data, init_img);
ggml_tensor* masked_image;
ggml_tensor* concat_latent;
ggml_tensor* denoise_mask = NULL;
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
int64_t mask_channels = 1;
@ -1765,22 +1795,22 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
}
ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
sd_apply_mask(init_img, mask_img, masked_img);
ggml_tensor* masked_image_0 = NULL;
ggml_tensor* masked_latent = NULL;
if (!sd_ctx->sd->use_tiny_autoencoder) {
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
masked_image_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
} else {
masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
}
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], mask_channels + masked_image_0->ne[2], 1);
for (int ix = 0; ix < masked_image_0->ne[0]; ix++) {
for (int iy = 0; iy < masked_image_0->ne[1]; iy++) {
concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], mask_channels + masked_latent->ne[2], 1);
for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
int mx = ix * 8;
int my = iy * 8;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
for (int k = 0; k < masked_image_0->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
ggml_tensor_set_f32(masked_image, v, ix, iy, k);
for (int k = 0; k < masked_latent->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
}
// "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
for (int x = 0; x < 8; x++) {
@ -1788,28 +1818,30 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
// TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
// python code was using "b (h 8) (w 8) -> b (8 8) h w"
ggml_tensor_set_f32(masked_image, m, ix, iy, masked_image_0->ne[2] + x * 8 + y);
ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
}
}
} else {
float m = ggml_tensor_get_f32(mask_img, mx, my);
ggml_tensor_set_f32(masked_image, m, ix, iy, 0);
for (int k = 0; k < masked_image_0->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
ggml_tensor_set_f32(masked_image, v, ix, iy, k + mask_channels);
ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
for (int k = 0; k < masked_latent->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
}
}
}
}
} else {
}
{
// LOG_WARN("Inpainting with a base model is not great");
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
for (int ix = 0; ix < masked_image->ne[0]; ix++) {
for (int iy = 0; iy < masked_image->ne[1]; iy++) {
denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
int mx = ix * 8;
int my = iy * 8;
float m = ggml_tensor_get_f32(mask_img, mx, my);
ggml_tensor_set_f32(masked_image, m, ix, iy);
ggml_tensor_set_f32(denoise_mask, m, ix, iy);
}
}
}
@ -1822,7 +1854,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
}
print_ggml_tensor(init_latent, true);
size_t t1 = ggml_time_ms();
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
@ -1840,7 +1871,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
prompt_c_str,
negative_prompt_c_str,
clip_skip,
cfg_scale,
guidance,
eta,
width,
@ -1855,11 +1885,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
normalize_input,
input_id_images_path_c_str,
{},
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end,
masked_image);
concat_latent,
denoise_mask);
size_t t2 = ggml_time_ms();
@ -1876,8 +1903,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
int motion_bucket_id,
int fps,
float augmentation_level,
float min_cfg,
float cfg_scale,
sd_guidance_params_t guidance,
enum sample_method_t sample_method,
int sample_steps,
float strength,
@ -1953,10 +1979,9 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
cond,
uncond,
{},
{},
0.f,
min_cfg,
cfg_scale,
0.f,
guidance,
0.f,
sample_method,
sigmas,
@ -2007,26 +2032,19 @@ sd_image_t* edit(sd_ctx_t* sd_ctx,
const char* prompt_c_str,
const char* negative_prompt_c_str,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
sample_method_t sample_method,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed,
int batch_count,
const sd_image_t* control_cond,
float control_strength,
float style_ratio,
bool normalize_input,
int* skip_layers = NULL,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
const char* input_id_images_path_c_str) {
LOG_DEBUG("edit %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
@ -2064,11 +2082,21 @@ sd_image_t* edit(sd_ctx_t* sd_ctx,
sd_image_to_tensor(ref_images[i].data, img);
ggml_tensor* latent = NULL;
if (!sd_ctx->sd->use_tiny_autoencoder) {
if (sd_ctx->sd->use_tiny_autoencoder) {
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
} else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) {
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
latent = ggml_view_3d(work_ctx,
latent,
latent->ne[0],
latent->ne[1],
latent->ne[2] / 2,
latent->nb[1],
latent->nb[2],
0);
} else {
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
} else {
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
}
ref_latents.push_back(latent);
}
@ -2086,7 +2114,6 @@ sd_image_t* edit(sd_ctx_t* sd_ctx,
prompt_c_str,
negative_prompt_c_str,
clip_skip,
cfg_scale,
guidance,
eta,
width,
@ -2101,10 +2128,6 @@ sd_image_t* edit(sd_ctx_t* sd_ctx,
normalize_input,
"",
ref_latents,
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end,
NULL);
size_t t2 = ggml_time_ms();

View File

@ -129,6 +129,22 @@ typedef struct {
typedef struct sd_ctx_t sd_ctx_t;
// Skip Layer Guidance (SLG) parameters, only used by DiT models.
typedef struct {
int* layers;        // indices of the layers to skip; caller-owned array, copied internally before sampling
size_t layer_count; // number of entries in `layers`
float layer_start;  // fraction of total sampling steps after which layer skipping begins
float layer_end;    // fraction of total sampling steps before which layer skipping ends
float scale;        // SLG strength; 0 disables skip-layer guidance entirely
} sd_slg_params_t;
// Classifier-free guidance (CFG) parameters passed to txt2img/img2img/img2vid/edit.
typedef struct {
float txt_cfg;            // text-conditioning CFG scale (the classic cfg_scale)
float img_cfg;            // image-conditioning CFG scale; when it differs from txt_cfg,
                          // 2-conditioning CFG is used (inpaint / instruct-pix2pix models only)
float min_cfg;            // lower bound for per-frame CFG interpolation in video models
float distilled_guidance; // guidance value fed directly to models with a guidance input
sd_slg_params_t slg;      // skip-layer guidance settings (DiT models)
} sd_guidance_params_t;
SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
const char* clip_l_path,
const char* clip_g_path,
@ -161,8 +177,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@ -174,12 +189,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_strength,
bool normalize_input,
const char* input_id_images_path,
int* skip_layers,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
float skip_layer_end);
const char* input_id_images_path);
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
@ -187,8 +197,7 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@ -201,12 +210,7 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_strength,
bool normalize_input,
const char* input_id_images_path,
int* skip_layers,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
float skip_layer_end);
const char* input_id_images_path);
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sd_image_t init_image,
@ -216,8 +220,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
int motion_bucket_id,
int fps,
float augmentation_level,
float min_cfg,
float cfg_scale,
sd_guidance_params_t guidance,
enum sample_method_t sample_method,
int sample_steps,
float strength,
@ -229,25 +232,19 @@ SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed,
int batch_count,
const sd_image_t* control_cond,
float control_strength,
float style_strength,
bool normalize_input,
int* skip_layers,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
float skip_layer_end);
const char* input_id_images_path);
typedef struct upscaler_ctx_t upscaler_ctx_t;

View File

@ -207,6 +207,8 @@ public:
}
if (sd_version_is_inpaint(version)) {
in_channels = 9;
} else if (sd_version_is_unet_edit(version)) {
in_channels = 8;
}
// dims is always 2