feat: add auto-resize support for reference images (was Qwen-Image-Edit only) (#898)

This commit is contained in:
leejet 2025-10-18 16:37:09 +08:00 committed by GitHub
parent db6f4791b4
commit 90ef5f8246
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 21 additions and 6 deletions

View File

@ -35,8 +35,9 @@ arguments:
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
--control-video [PATH] path to control video frames, It must be a directory path. --control-video [PATH] path to control video frames, It must be a directory path.
The video frames inside should be stored as images in lexicographical (character) order The video frames inside should be stored as images in lexicographical (character) order
For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc. For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, ... etc.
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images
-o, --output OUTPUT path to write result image to (default: ./output.png) -o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render -p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "") -n, --negative-prompt PROMPT the negative prompt (default: "")

View File

@ -80,7 +80,8 @@ struct SDParams {
std::string control_image_path; std::string control_image_path;
std::vector<std::string> ref_image_paths; std::vector<std::string> ref_image_paths;
std::string control_video_path; std::string control_video_path;
bool increase_ref_index = false; bool auto_resize_ref_image = true;
bool increase_ref_index = false;
std::string prompt; std::string prompt;
std::string negative_prompt; std::string negative_prompt;
@ -175,6 +176,7 @@ void print_params(SDParams params) {
printf(" %s\n", path.c_str()); printf(" %s\n", path.c_str());
}; };
printf(" control_video_path: %s\n", params.control_video_path.c_str()); printf(" control_video_path: %s\n", params.control_video_path.c_str());
printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false");
printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false");
printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
@ -244,9 +246,10 @@ void print_usage(int argc, const char* argv[]) {
printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n"); printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n");
printf(" --control-image [IMAGE] path to image condition, control net\n"); printf(" --control-image [IMAGE] path to image condition, control net\n");
printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n"); printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
printf(" --disable-auto-resize-ref-image disable auto resize of ref images\n");
printf(" --control-video [PATH] path to control video frames, It must be a directory path.\n"); printf(" --control-video [PATH] path to control video frames, It must be a directory path.\n");
printf(" The video frames inside should be stored as images in lexicographical (character) order\n"); printf(" The video frames inside should be stored as images in lexicographical (character) order\n");
printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, etc.\n"); printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, ... etc.\n");
printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n"); printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n"); printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
printf(" -p, --prompt [PROMPT] the prompt to render\n"); printf(" -p, --prompt [PROMPT] the prompt to render\n");
@ -579,6 +582,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
{"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask}, {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
{"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask}, {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
{"", "--increase-ref-index", "", true, &params.increase_ref_index}, {"", "--increase-ref-index", "", true, &params.increase_ref_index},
{"", "--disable-auto-resize-ref-image", "", false, &params.auto_resize_ref_image},
}; };
auto on_mode_arg = [&](int argc, const char** argv, int index) { auto on_mode_arg = [&](int argc, const char** argv, int index) {
@ -1428,6 +1432,7 @@ int main(int argc, const char* argv[]) {
init_image, init_image,
ref_images.data(), ref_images.data(),
(int)ref_images.size(), (int)ref_images.size(),
params.auto_resize_ref_image,
params.increase_ref_index, params.increase_ref_index,
mask_image, mask_image,
params.width, params.width,

View File

@ -1970,6 +1970,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
"seed: %" PRId64 "seed: %" PRId64
"batch_count: %d\n" "batch_count: %d\n"
"ref_images_count: %d\n" "ref_images_count: %d\n"
"auto_resize_ref_image: %s\n"
"increase_ref_index: %s\n" "increase_ref_index: %s\n"
"control_strength: %.2f\n" "control_strength: %.2f\n"
"photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n" "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
@ -1984,6 +1985,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
sd_img_gen_params->seed, sd_img_gen_params->seed,
sd_img_gen_params->batch_count, sd_img_gen_params->batch_count,
sd_img_gen_params->ref_images_count, sd_img_gen_params->ref_images_count,
BOOL_STR(sd_img_gen_params->auto_resize_ref_image),
BOOL_STR(sd_img_gen_params->increase_ref_index), BOOL_STR(sd_img_gen_params->increase_ref_index),
sd_img_gen_params->control_strength, sd_img_gen_params->control_strength,
sd_img_gen_params->pm_params.style_strength, sd_img_gen_params->pm_params.style_strength,
@ -2624,14 +2626,20 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
std::vector<ggml_tensor*> ref_latents; std::vector<ggml_tensor*> ref_latents;
for (int i = 0; i < ref_images.size(); i++) { for (int i = 0; i < ref_images.size(); i++) {
ggml_tensor* img; ggml_tensor* img;
if (sd_version_is_qwen_image(sd_ctx->sd->version)) { if (sd_img_gen_params->auto_resize_ref_image) {
LOG_DEBUG("auto resize ref images");
sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]); sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height); int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height);
double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height); double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
double vae_height = vae_width * ref_image.height / ref_image.width; double vae_height = vae_width * ref_image.height / ref_image.width;
vae_height = round(vae_height / 32) * 32; int factor = 16;
vae_width = round(vae_width / 32) * 32; if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
factor = 32;
}
vae_height = round(vae_height / factor) * factor;
vae_width = round(vae_width / factor) * factor;
sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height)); sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
free(ref_image.data); free(ref_image.data);

View File

@ -216,6 +216,7 @@ typedef struct {
sd_image_t init_image; sd_image_t init_image;
sd_image_t* ref_images; sd_image_t* ref_images;
int ref_images_count; int ref_images_count;
bool auto_resize_ref_image;
bool increase_ref_index; bool increase_ref_index;
sd_image_t mask_image; sd_image_t mask_image;
int width; int width;