From 3d565871d9e2d48d693d1eba19ee3cf7bef17941 Mon Sep 17 00:00:00 2001 From: leejet Date: Fri, 26 Dec 2025 00:10:22 +0800 Subject: [PATCH] add support for ref_clip_feats --- diffusion_model.hpp | 3 ++- stable-diffusion.cpp | 21 ++++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 5cf7717..89a26a4 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -24,6 +24,7 @@ struct DiffusionParams { float vace_strength = 1.f; std::vector skip_layers = {}; std::vector extra_contexts; // for z-image-omni + std::vector ref_clip_feats; // for z-image-omni }; struct DiffusionModel { @@ -444,7 +445,7 @@ struct ZImageModel : public DiffusionModel { diffusion_params.timesteps, contexts, diffusion_params.ref_latents, - {}, + diffusion_params.ref_clip_feats, output, output_ctx); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 84c9ffe..9cd23e9 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1528,6 +1528,7 @@ public: int start_merge_step, SDCondition id_cond, std::vector ref_latents = {}, + std::vector ref_clip_feats = {}, bool increase_ref_index = false, ggml_tensor* denoise_mask = nullptr, ggml_tensor* vace_context = nullptr, @@ -1651,7 +1652,7 @@ public: TaylorSeerConfig tcfg; tcfg.enabled = (cache_params->mode == SD_CACHE_TAYLORSEER || - cache_params->mode == SD_CACHE_CACHE_DIT); + cache_params->mode == SD_CACHE_CACHE_DIT); tcfg.n_derivatives = cache_params->taylorseer_n_derivatives; tcfg.skip_interval_steps = cache_params->taylorseer_skip_interval; @@ -1921,6 +1922,7 @@ public: diffusion_params.timesteps = timesteps; diffusion_params.guidance = guidance_tensor; diffusion_params.ref_latents = ref_latents; + diffusion_params.ref_clip_feats = ref_clip_feats; diffusion_params.increase_ref_index = increase_ref_index; diffusion_params.controls = controls; diffusion_params.control_strength = control_strength; @@ -3103,6 +3105,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, sd_pm_params_t pm_params, std::vector ref_images, std::vector ref_latents, + std::vector ref_clip_feats, bool increase_ref_index, ggml_tensor* concat_latent = nullptr, ggml_tensor* denoise_mask = nullptr, @@ -3391,6 +3394,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, start_merge_step, id_cond, ref_latents, + ref_clip_feats, increase_ref_index, denoise_mask, nullptr, @@ -3657,6 +3661,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g } std::vector ref_latents; + std::vector ref_clip_feats; for (int i = 0; i < ref_images.size(); i++) { ggml_tensor* img; if (sd_img_gen_params->auto_resize_ref_image) { @@ -3703,6 +3708,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g ggml_tensor* latent = sd_ctx->sd->encode_first_stage(work_ctx, img); ref_latents.push_back(latent); + + auto clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, *ref_images[i], false, -2); + ref_clip_feats.push_back(clip_vision_output); } if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) { @@ -3730,6 +3738,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g sd_img_gen_params->pm_params, ref_images, ref_latents, + ref_clip_feats, sd_img_gen_params->increase_ref_index, concat_latent, denoise_mask, @@ -4098,8 +4107,9 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s high_noise_sample_method, high_noise_sigmas, -1, - {}, - {}, + {}, // id_cond + {}, // ref_latents + {}, // ref_clip_feats false, denoise_mask, vace_context, @@ -4135,8 +4145,9 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s sample_method, sigmas, -1, - {}, - {}, + {}, // id_cond + {}, // ref_latents + {}, // ref_clip_feats false, denoise_mask, vace_context,