feat: add auto-resize support for reference images (was Qwen-Image-Edit only) (#898)

2026-02-04 10:53:34 +00:00 · 2025-10-18 16:37:09 +08:00 · 2025-10-18 16:37:09 +08:00 · 90ef5f8246
commit 90ef5f8246
parent db6f4791b4
4 changed files with 21 additions and 6 deletions
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -35,8 +35,9 @@ arguments:
  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times)
  --control-video [PATH]             path to control video frames, It must be a directory path.
                                     The video frames inside should be stored as images in lexicographical (character) order
-                                     For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, 鈥?etc.
+                                     For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, ... etc.
  --increase-ref-index               automatically increase the indices of references images based on the order they are listed (starting with 1).
+  --disable-auto-resize-ref-image    disable auto resize of ref images
  -o, --output OUTPUT                path to write result image to (default: ./output.png)
  -p, --prompt [PROMPT]              the prompt to render
  -n, --negative-prompt PROMPT       the negative prompt (default: "")
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -80,7 +80,8 @@ struct SDParams {
    std::string control_image_path;
    std::vector<std::string> ref_image_paths;
    std::string control_video_path;
-    bool increase_ref_index = false;
+    bool auto_resize_ref_image = true;
+    bool increase_ref_index    = false;

    std::string prompt;
    std::string negative_prompt;
@@ -175,6 +176,7 @@ void print_params(SDParams params) {
        printf("        %s\n", path.c_str());
    };
    printf("    control_video_path:                %s\n", params.control_video_path.c_str());
+    printf("    auto_resize_ref_image:             %s\n", params.auto_resize_ref_image ? "true" : "false");
    printf("    increase_ref_index:                %s\n", params.increase_ref_index ? "true" : "false");
    printf("    offload_params_to_cpu:             %s\n", params.offload_params_to_cpu ? "true" : "false");
    printf("    clip_on_cpu:                       %s\n", params.clip_on_cpu ? "true" : "false");
@@ -244,9 +246,10 @@ void print_usage(int argc, const char* argv[]) {
    printf("  -i, --end-img [IMAGE]              path to the end image, required by flf2v\n");
    printf("  --control-image [IMAGE]            path to image condition, control net\n");
    printf("  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times) \n");
+    printf("  --disable-auto-resize-ref-image    disable auto resize of ref images\n");
    printf("  --control-video [PATH]             path to control video frames, It must be a directory path.\n");
    printf("                                     The video frames inside should be stored as images in lexicographical (character) order\n");
-    printf("                                     For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.\n");
+    printf("                                     For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, ... etc.\n");
    printf("  --increase-ref-index               automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
    printf("  -o, --output OUTPUT                path to write result image to (default: ./output.png)\n");
    printf("  -p, --prompt [PROMPT]              the prompt to render\n");
@@ -579,6 +582,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
        {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
        {"", "--increase-ref-index", "", true, &params.increase_ref_index},
+        {"", "--disable-auto-resize-ref-image", "", false, &params.auto_resize_ref_image},
    };

    auto on_mode_arg = [&](int argc, const char** argv, int index) {
@@ -1428,6 +1432,7 @@ int main(int argc, const char* argv[]) {
                init_image,
                ref_images.data(),
                (int)ref_images.size(),
+                params.auto_resize_ref_image,
                params.increase_ref_index,
                mask_image,
                params.width,
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1970,6 +1970,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
             "seed: %" PRId64
             "batch_count: %d\n"
             "ref_images_count: %d\n"
+             "auto_resize_ref_image: %s\n"
             "increase_ref_index: %s\n"
             "control_strength: %.2f\n"
             "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
@@ -1984,6 +1985,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
             sd_img_gen_params->seed,
             sd_img_gen_params->batch_count,
             sd_img_gen_params->ref_images_count,
+             BOOL_STR(sd_img_gen_params->auto_resize_ref_image),
             BOOL_STR(sd_img_gen_params->increase_ref_index),
             sd_img_gen_params->control_strength,
             sd_img_gen_params->pm_params.style_strength,
@@ -2624,14 +2626,20 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
    std::vector<ggml_tensor*> ref_latents;
    for (int i = 0; i < ref_images.size(); i++) {
        ggml_tensor* img;
-        if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
+        if (sd_img_gen_params->auto_resize_ref_image) {
+            LOG_DEBUG("auto resize ref images");
            sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
            int VAE_IMAGE_SIZE       = std::min(1024 * 1024, width * height);
            double vae_width         = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
            double vae_height        = vae_width * ref_image.height / ref_image.width;

-            vae_height = round(vae_height / 32) * 32;
-            vae_width  = round(vae_width / 32) * 32;
+            int factor = 16;
+            if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
+                factor = 32;
+            }
+
+            vae_height = round(vae_height / factor) * factor;
+            vae_width  = round(vae_width / factor) * factor;

            sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
            free(ref_image.data);
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -216,6 +216,7 @@ typedef struct {
    sd_image_t init_image;
    sd_image_t* ref_images;
    int ref_images_count;
+    bool auto_resize_ref_image;
    bool increase_ref_index;
    sd_image_t mask_image;
    int width;