Compare commits

...

3 Commits

Author SHA1 Message Date
leejet
d7f430cd69 docs: update docs and help message 2025-09-07 02:26:44 +08:00
stduhpf
141a4b4113
feat: add flow shift parameter (for SD3 and Wan) (#780)
* Add flow shift parameter (for SD3 and Wan)

* unify code style and fix some issues

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-07 02:16:59 +08:00
stduhpf
21ce9fe2cf
feat: add support for timestep boundary based automatic expert routing in Wan MoE (#779)
* Wan MoE: Automatic expert routing based on timestep boundary

* unify code style and fix some issues

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-07 01:44:10 +08:00
6 changed files with 67 additions and 21 deletions

View File

@ -7,7 +7,7 @@
Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
***Note that this project is under active development. \
API and command-line parameters may change frequently.***
API and command-line option may change frequently.***
## Features
@ -290,9 +290,10 @@ usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
-M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen
-M, --mode [MODE] run mode, one of: [img_gen, vid_gen, convert], default: img_gen
-t, --threads N number of threads to use during computation (default: -1)
If threads <= 0, then threads will be set to the number of CPU physical cores
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
-m, --model [MODEL] path to full model
--diffusion-model path to the standalone diffusion model
--high-noise-diffusion-model path to the standalone high noise diffusion model
@ -346,7 +347,7 @@ arguments:
--high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
(high noise) sampling method (default: "euler_a")
--high-noise-steps STEPS (high noise) number of sample steps (default: 20)
--high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--strength STRENGTH strength for noising/unnoising (default: 0.75)
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20)
@ -377,6 +378,9 @@ arguments:
--chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
--video-frames video frames (default: 1)
--fps fps (default: 24)
--moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
only enabled if `--high-noise-steps` is set to -1
--flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
-v, --verbose print extra info
```

View File

@ -382,7 +382,8 @@ struct DiscreteFlowDenoiser : public Denoiser {
float sigma_data = 1.0f;
DiscreteFlowDenoiser() {
DiscreteFlowDenoiser(float shift = 3.0f)
: shift(shift) {
set_parameters();
}

View File

@ -43,8 +43,6 @@
## Examples
Since GitHub does not support AVI files, the file I uploaded was converted from AVI to MP4.
### Wan2.1 T2V 1.3B
```

View File

@ -89,6 +89,8 @@ struct SDParams {
std::vector<int> high_noise_skip_layers = {7, 8, 9};
sd_sample_params_t high_noise_sample_params;
float moe_boundary = 0.875f;
int video_frames = 1;
int fps = 16;
@ -113,10 +115,12 @@ struct SDParams {
bool chroma_use_dit_mask = true;
bool chroma_use_t5_mask = false;
int chroma_t5_mask_pad = 1;
float flow_shift = INFINITY;
SDParams() {
sd_sample_params_init(&sample_params);
sd_sample_params_init(&high_noise_sample_params);
high_noise_sample_params.sample_steps = -1;
}
};
@ -167,6 +171,8 @@ void print_params(SDParams params) {
printf(" height: %d\n", params.height);
printf(" sample_params: %s\n", SAFE_STR(sample_params_str));
printf(" high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str));
printf(" moe_boundary: %.3f\n", params.moe_boundary);
printf(" flow_shift: %.2f\n", params.flow_shift);
printf(" strength(img2img): %.2f\n", params.strength);
printf(" rng: %s\n", sd_rng_type_name(params.rng_type));
printf(" seed: %ld\n", params.seed);
@ -187,9 +193,10 @@ void print_usage(int argc, const char* argv[]) {
printf("\n");
printf("arguments:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen\n");
printf(" -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, convert], default: img_gen\n");
printf(" -t, --threads N number of threads to use during computation (default: -1)\n");
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
printf(" --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed\n");
printf(" -m, --model [MODEL] path to full model\n");
printf(" --diffusion-model path to the standalone diffusion model\n");
printf(" --high-noise-diffusion-model path to the standalone high noise diffusion model\n");
@ -243,7 +250,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
printf(" --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
printf(" (high noise) sampling method (default: \"euler_a\")\n");
printf(" --high-noise-steps STEPS (high noise) number of sample steps (default: 20)\n");
printf(" --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)\n");
printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20)\n");
@ -274,6 +281,9 @@ void print_usage(int argc, const char* argv[]) {
printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n");
printf(" --video-frames video frames (default: 1)\n");
printf(" --fps fps (default: 24)\n");
printf(" --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)\n");
printf(" only enabled if `--high-noise-steps` is set to -1\n");
printf(" --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)\n");
printf(" -v, --verbose print extra info\n");
}
@ -362,7 +372,7 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) {
std::string arg;
for (int i = 1; i < argc; i++) {
bool found_arg = false;
arg = argv[i];
arg = argv[i];
for (auto& option : options.string_options) {
if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
@ -423,7 +433,7 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) {
for (auto& option : options.manual_options) {
if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
found_arg = true;
int ret = option.cb(argc, argv, i);
int ret = option.cb(argc, argv, i);
if (ret < 0) {
invalid_arg = true;
break;
@ -435,7 +445,7 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) {
break;
}
if (!found_arg) {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
return false;
}
}
@ -507,6 +517,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
{"", "--strength", "", &params.strength},
{"", "--style-ratio", "", &params.style_ratio},
{"", "--control-strength", "", &params.control_strength},
{"", "--moe-boundary", "", &params.moe_boundary},
{"", "--flow-shift", "", &params.flow_shift},
};
options.bool_options = {
@ -767,8 +779,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
}
if (params.high_noise_sample_params.sample_steps <= 0) {
fprintf(stderr, "error: the high_noise_sample_steps must be greater than 0\n");
exit(1);
params.high_noise_sample_params.sample_steps = -1;
}
if (params.strength < 0.f || params.strength > 1.f) {
@ -1175,6 +1186,7 @@ int main(int argc, const char* argv[]) {
params.chroma_use_dit_mask,
params.chroma_use_t5_mask,
params.chroma_t5_mask_pad,
params.flow_shift,
};
sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
@ -1222,6 +1234,7 @@ int main(int argc, const char* argv[]) {
params.height,
params.sample_params,
params.high_noise_sample_params,
params.moe_boundary,
params.strength,
params.seed,
params.video_frames,

View File

@ -681,7 +681,11 @@ public:
if (sd_version_is_sd3(version)) {
LOG_INFO("running in FLOW mode");
denoiser = std::make_shared<DiscreteFlowDenoiser>();
float shift = sd_ctx_params->flow_shift;
if (shift == INFINITY) {
shift = 3.0;
}
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
} else if (sd_version_is_flux(version)) {
LOG_INFO("running in Flux FLOW mode");
float shift = 1.0f; // TODO: validate
@ -694,7 +698,11 @@ public:
denoiser = std::make_shared<FluxFlowDenoiser>(shift);
} else if (sd_version_is_wan(version)) {
LOG_INFO("running in FLOW mode");
denoiser = std::make_shared<DiscreteFlowDenoiser>();
float shift = sd_ctx_params->flow_shift;
if (shift == INFINITY) {
shift = 5.0;
}
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
} else if (is_using_v_parameterization) {
LOG_INFO("running in v-prediction mode");
denoiser = std::make_shared<CompVisVDenoiser>();
@ -1553,6 +1561,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
sd_ctx_params->chroma_use_dit_mask = true;
sd_ctx_params->chroma_use_t5_mask = false;
sd_ctx_params->chroma_t5_mask_pad = 1;
sd_ctx_params->flow_shift = INFINITY;
}
char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
@ -1727,11 +1736,13 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
sd_sample_params_init(&sd_vid_gen_params->sample_params);
sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
sd_vid_gen_params->width = 512;
sd_vid_gen_params->height = 512;
sd_vid_gen_params->strength = 0.75f;
sd_vid_gen_params->seed = -1;
sd_vid_gen_params->video_frames = 6;
sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;
sd_vid_gen_params->width = 512;
sd_vid_gen_params->height = 512;
sd_vid_gen_params->strength = 0.75f;
sd_vid_gen_params->seed = -1;
sd_vid_gen_params->video_frames = 6;
sd_vid_gen_params->moe_boundary = 0.875f;
}
struct sd_ctx_t {
@ -2381,7 +2392,24 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
}
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps + high_noise_sample_steps);
int total_steps = sample_steps;
if (high_noise_sample_steps > 0) {
total_steps += high_noise_sample_steps;
}
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps);
if (high_noise_sample_steps < 0) {
// timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
for (size_t i = 0; i < sigmas.size(); ++i) {
if (sigmas[i] < sd_vid_gen_params->moe_boundary) {
high_noise_sample_steps = i;
break;
}
}
LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps);
sample_steps = total_steps - high_noise_sample_steps;
}
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(200 * 1024) * 1024; // 200 MB

View File

@ -142,6 +142,7 @@ typedef struct {
bool chroma_use_dit_mask;
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
float flow_shift;
} sd_ctx_params_t;
typedef struct {
@ -205,6 +206,7 @@ typedef struct {
int height;
sd_sample_params_t sample_params;
sd_sample_params_t high_noise_sample_params;
float moe_boundary;
float strength;
int64_t seed;
int video_frames;