From 53aeb555bd1de21242691aecd3aa6de97ccc067b Mon Sep 17 00:00:00 2001 From: leejet Date: Mon, 8 Sep 2025 23:02:38 +0800 Subject: [PATCH] add --vace-strength option --- diffusion_model.hpp | 2 +- examples/cli/main.cpp | 12 ++++++++---- stable-diffusion.cpp | 23 +++++++++++++---------- stable-diffusion.h | 1 + wan.hpp | 14 +++++++------- 5 files changed, 30 insertions(+), 22 deletions(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 896bed4..995a6a0 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -14,7 +14,7 @@ struct DiffusionParams { struct ggml_tensor* y = NULL; struct ggml_tensor* guidance = NULL; std::vector ref_latents = {}; - bool increase_ref_index = false; + bool increase_ref_index = false; int num_video_frames = -1; std::vector controls = {}; float control_strength = 0.f; diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 098c98c..7779db2 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -91,10 +91,10 @@ struct SDParams { std::vector high_noise_skip_layers = {7, 8, 9}; sd_sample_params_t high_noise_sample_params; - float moe_boundary = 0.875f; - - int video_frames = 1; - int fps = 16; + float moe_boundary = 0.875f; + int video_frames = 1; + int fps = 16; + float vace_strength = 1.f; float strength = 0.75f; float control_strength = 0.9f; @@ -186,6 +186,7 @@ void print_params(SDParams params) { printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false"); printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad); printf(" video_frames: %d\n", params.video_frames); + printf(" vace_strength: %.2f\n", params.vace_strength); printf(" fps: %d\n", params.fps); free(sample_params_str); free(high_noise_sample_params_str); @@ -288,6 +289,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. 
(default: 0.875)\n"); printf(" only enabled if `--high-noise-steps` is set to -1\n"); printf(" --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)\n"); + printf(" --vace-strength wan vace strength\n"); printf(" -v, --verbose print extra info\n"); } @@ -523,6 +525,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--control-strength", "", &params.control_strength}, {"", "--moe-boundary", "", &params.moe_boundary}, {"", "--flow-shift", "", &params.flow_shift}, + {"", "--vace-strength", "", &params.vace_strength}, }; options.bool_options = { @@ -1244,6 +1247,7 @@ int main(int argc, const char* argv[]) { params.strength, params.seed, params.video_frames, + params.vace_strength, }; results = generate_video(sd_ctx, &vid_gen_params, &num_results); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index d3c0a57..d2bf817 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1119,15 +1119,15 @@ public: } DiffusionParams diffusion_params; - diffusion_params.x = noised_input; - diffusion_params.timesteps = timesteps; - diffusion_params.guidance = guidance_tensor; - diffusion_params.ref_latents = ref_latents; + diffusion_params.x = noised_input; + diffusion_params.timesteps = timesteps; + diffusion_params.guidance = guidance_tensor; + diffusion_params.ref_latents = ref_latents; diffusion_params.increase_ref_index = increase_ref_index; - diffusion_params.controls = controls; - diffusion_params.control_strength = control_strength; - diffusion_params.vace_context = vace_context; - diffusion_params.vace_strength = vace_strength; + diffusion_params.controls = controls; + diffusion_params.control_strength = control_strength; + diffusion_params.vace_context = vace_context; + diffusion_params.vace_strength = vace_strength; if (start_merge_step == -1 || step <= start_merge_step) { // cond @@ -1728,6 +1728,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { sd_vid_gen_params->seed = -1; 
sd_vid_gen_params->video_frames = 6; sd_vid_gen_params->moe_boundary = 0.875f; + sd_vid_gen_params->vace_strength = 1.f; } struct sd_ctx_t { @@ -2644,7 +2645,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s {}, false, denoise_mask, - vace_context); + vace_context, + sd_vid_gen_params->vace_strength); int64_t sampling_end = ggml_time_ms(); LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); @@ -2678,7 +2680,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s {}, false, denoise_mask, - vace_context); + vace_context, + sd_vid_gen_params->vace_strength); int64_t sampling_end = ggml_time_ms(); LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); diff --git a/stable-diffusion.h b/stable-diffusion.h index 34b0d14..57aad81 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -211,6 +211,7 @@ typedef struct { float strength; int64_t seed; int video_frames; + float vace_strength; } sd_vid_gen_params_t; typedef struct sd_ctx_t sd_ctx_t; diff --git a/wan.hpp b/wan.hpp index 03bad4d..f4cf8ab 100644 --- a/wan.hpp +++ b/wan.hpp @@ -1533,12 +1533,12 @@ namespace WAN { } virtual struct ggml_tensor* forward(struct ggml_context* ctx, - ggml_backend_t backend, - struct ggml_tensor* x, - struct ggml_tensor* e, - struct ggml_tensor* pe, - struct ggml_tensor* context, - int64_t context_img_len = 257) { + ggml_backend_t backend, + struct ggml_tensor* x, + struct ggml_tensor* e, + struct ggml_tensor* pe, + struct ggml_tensor* context, + int64_t context_img_len = 257) { // x: [N, n_token, dim] // e: [N, 6, dim] or [N, T, 6, dim] // context: [N, context_img_len + context_txt_len, dim] @@ -1610,7 +1610,7 @@ namespace WAN { } std::pair forward(struct ggml_context* ctx, - ggml_backend_t backend, + ggml_backend_t backend, struct ggml_tensor* c, struct ggml_tensor* x, struct ggml_tensor* e,