docs: update docs and help message

feat: add flow shift parameter (for SD3 and Wan) (#780)
* Add flow shift parameter (for SD3 and Wan) * unify code style and fix some issues --------- Co-authored-by: leejet <leejet714@gmail.com>
2025-12-13 05:48:56 +00:00 · 2025-09-07 02:26:44 +08:00 · 2025-09-07 02:16:59 +08:00 · 2025-09-07 01:44:10 +08:00
6 changed files with 67 additions and 21 deletions
--- a/README.md
+++ b/README.md
@ -7,7 +7,7 @@
 Diffusion model(SD,Flux,Wan,...) inference in pure C/C++

 ***Note that this project is under active development. \
-API and command-line parameters may change frequently.***
+API and command-line options may change frequently.***

 ## Features

@ -290,9 +290,10 @@ usage: ./bin/sd [arguments]

 arguments:
  -h, --help                         show this help message and exit
-  -M, --mode [MODE]                  run mode, one of: [img_gen, convert], default: img_gen
+  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, convert], default: img_gen
  -t, --threads N                    number of threads to use during computation (default: -1)
                                     If threads <= 0, then threads will be set to the number of CPU physical cores
+  --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
  -m, --model [MODEL]                path to full model
  --diffusion-model                  path to the standalone diffusion model
  --high-noise-diffusion-model       path to the standalone high noise diffusion model
@ -346,7 +347,7 @@ arguments:
  --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
                                     (high noise) sampling method (default: "euler_a")
-  --high-noise-steps  STEPS          (high noise) number of sample steps (default: 20)
+  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)
                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
  --strength STRENGTH                strength for noising/unnoising (default: 0.75)
  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)
@ -377,6 +378,9 @@ arguments:
  --chroma-t5-mask-pad  PAD_SIZE     t5 mask pad size of chroma
  --video-frames                     video frames (default: 1)
  --fps                              fps (default: 16)
+  --moe-boundary BOUNDARY            timestep boundary for Wan2.2 MoE model. (default: 0.875)
+                                     only enabled if `--high-noise-steps` is set to -1
+  --flow-shift SHIFT                 shift value for Flow models like SD3.x or WAN (default: auto)
  -v, --verbose                      print extra info
 ```

--- a/denoiser.hpp
+++ b/denoiser.hpp
@ -382,7 +382,8 @@ struct DiscreteFlowDenoiser : public Denoiser {

    float sigma_data = 1.0f;

-    DiscreteFlowDenoiser() {
+    DiscreteFlowDenoiser(float shift = 3.0f)
+        : shift(shift) {
        set_parameters();
    }

--- a/docs/wan.md
+++ b/docs/wan.md
@ -43,8 +43,6 @@

 ## Examples

-Since GitHub does not support AVI files, the file I uploaded was converted from AVI to MP4.
-
 ### Wan2.1 T2V 1.3B

 ```
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -89,6 +89,8 @@ struct SDParams {
    std::vector<int> high_noise_skip_layers = {7, 8, 9};
    sd_sample_params_t high_noise_sample_params;

+    float moe_boundary = 0.875f;
+
    int video_frames = 1;
    int fps          = 16;

@ -113,10 +115,12 @@ struct SDParams {
    bool chroma_use_dit_mask = true;
    bool chroma_use_t5_mask  = false;
    int chroma_t5_mask_pad   = 1;
+    float flow_shift         = INFINITY;

    SDParams() {
        sd_sample_params_init(&sample_params);
        sd_sample_params_init(&high_noise_sample_params);
+        high_noise_sample_params.sample_steps = -1;
    }
 };

@ -167,6 +171,8 @@ void print_params(SDParams params) {
    printf("    height:                            %d\n", params.height);
    printf("    sample_params:                     %s\n", SAFE_STR(sample_params_str));
    printf("    high_noise_sample_params:          %s\n", SAFE_STR(high_noise_sample_params_str));
+    printf("    moe_boundary:                      %.3f\n", params.moe_boundary);
+    printf("    flow_shift:                        %.2f\n", params.flow_shift);
    printf("    strength(img2img):                 %.2f\n", params.strength);
    printf("    rng:                               %s\n", sd_rng_type_name(params.rng_type));
    printf("    seed:                              %ld\n", params.seed);
@ -187,9 +193,10 @@ void print_usage(int argc, const char* argv[]) {
    printf("\n");
    printf("arguments:\n");
    printf("  -h, --help                         show this help message and exit\n");
-    printf("  -M, --mode [MODE]                  run mode, one of: [img_gen, convert], default: img_gen\n");
+    printf("  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, convert], default: img_gen\n");
    printf("  -t, --threads N                    number of threads to use during computation (default: -1)\n");
    printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
+    printf("  --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed\n");
    printf("  -m, --model [MODEL]                path to full model\n");
    printf("  --diffusion-model                  path to the standalone diffusion model\n");
    printf("  --high-noise-diffusion-model       path to the standalone high noise diffusion model\n");
@ -243,7 +250,7 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
    printf("  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
    printf("                                     (high noise) sampling method (default: \"euler_a\")\n");
-    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: 20)\n");
+    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)\n");
    printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
    printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
    printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)\n");
@ -274,6 +281,9 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --chroma-t5-mask-pad  PAD_SIZE     t5 mask pad size of chroma\n");
    printf("  --video-frames                     video frames (default: 1)\n");
    printf("  --fps                              fps (default: 24)\n");
+    printf("  --moe-boundary BOUNDARY            timestep boundary for Wan2.2 MoE model. (default: 0.875)\n");
+    printf("                                     only enabled if `--high-noise-steps` is set to -1\n");
+    printf("  --flow-shift SHIFT                 shift value for Flow models like SD3.x or WAN (default: auto)\n");
    printf("  -v, --verbose                      print extra info\n");
 }

@ -362,7 +372,7 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) {
    std::string arg;
    for (int i = 1; i < argc; i++) {
        bool found_arg = false;
-        arg = argv[i];
+        arg            = argv[i];

        for (auto& option : options.string_options) {
            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
@ -423,7 +433,7 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) {
        for (auto& option : options.manual_options) {
            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
                found_arg = true;
-                int ret = option.cb(argc, argv, i);
+                int ret   = option.cb(argc, argv, i);
                if (ret < 0) {
                    invalid_arg = true;
                    break;
@ -435,7 +445,7 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) {
            break;
        }
        if (!found_arg) {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());    
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            return false;
        }
    }
@ -507,6 +517,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        {"", "--strength", "", &params.strength},
        {"", "--style-ratio", "", &params.style_ratio},
        {"", "--control-strength", "", &params.control_strength},
+        {"", "--moe-boundary", "", &params.moe_boundary},
+        {"", "--flow-shift", "", &params.flow_shift},
    };

    options.bool_options = {
@ -767,8 +779,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
    }

    if (params.high_noise_sample_params.sample_steps <= 0) {
-        fprintf(stderr, "error: the high_noise_sample_steps must be greater than 0\n");
-        exit(1);
+        params.high_noise_sample_params.sample_steps = -1;
    }

    if (params.strength < 0.f || params.strength > 1.f) {
@ -1175,6 +1186,7 @@ int main(int argc, const char* argv[]) {
        params.chroma_use_dit_mask,
        params.chroma_use_t5_mask,
        params.chroma_t5_mask_pad,
+        params.flow_shift,
    };

    sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
@ -1222,6 +1234,7 @@ int main(int argc, const char* argv[]) {
            params.height,
            params.sample_params,
            params.high_noise_sample_params,
+            params.moe_boundary,
            params.strength,
            params.seed,
            params.video_frames,
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -681,7 +681,11 @@ public:

        if (sd_version_is_sd3(version)) {
            LOG_INFO("running in FLOW mode");
-            denoiser = std::make_shared<DiscreteFlowDenoiser>();
+            float shift = sd_ctx_params->flow_shift;
+            if (shift == INFINITY) {
+                shift = 3.0;
+            }
+            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
        } else if (sd_version_is_flux(version)) {
            LOG_INFO("running in Flux FLOW mode");
            float shift = 1.0f;  // TODO: validate
@ -694,7 +698,11 @@ public:
            denoiser = std::make_shared<FluxFlowDenoiser>(shift);
        } else if (sd_version_is_wan(version)) {
            LOG_INFO("running in FLOW mode");
-            denoiser = std::make_shared<DiscreteFlowDenoiser>();
+            float shift = sd_ctx_params->flow_shift;
+            if (shift == INFINITY) {
+                shift = 5.0;
+            }
+            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
        } else if (is_using_v_parameterization) {
            LOG_INFO("running in v-prediction mode");
            denoiser = std::make_shared<CompVisVDenoiser>();
@ -1553,6 +1561,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    sd_ctx_params->chroma_use_dit_mask     = true;
    sd_ctx_params->chroma_use_t5_mask      = false;
    sd_ctx_params->chroma_t5_mask_pad      = 1;
+    sd_ctx_params->flow_shift              = INFINITY;
 }

 char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
@ -1727,11 +1736,13 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
    memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
    sd_sample_params_init(&sd_vid_gen_params->sample_params);
    sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
-    sd_vid_gen_params->width        = 512;
-    sd_vid_gen_params->height       = 512;
-    sd_vid_gen_params->strength     = 0.75f;
-    sd_vid_gen_params->seed         = -1;
-    sd_vid_gen_params->video_frames = 6;
+    sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;
+    sd_vid_gen_params->width                                 = 512;
+    sd_vid_gen_params->height                                = 512;
+    sd_vid_gen_params->strength                              = 0.75f;
+    sd_vid_gen_params->seed                                  = -1;
+    sd_vid_gen_params->video_frames                          = 6;
+    sd_vid_gen_params->moe_boundary                          = 0.875f;
 }

 struct sd_ctx_t {
@ -2381,7 +2392,24 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
        high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
    }

-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps + high_noise_sample_steps);
+    int total_steps = sample_steps;
+
+    if (high_noise_sample_steps > 0) {
+        total_steps += high_noise_sample_steps;
+    }
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps);
+
+    if (high_noise_sample_steps < 0) {
+        // timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
+        for (size_t i = 0; i < sigmas.size(); ++i) {
+            if (sigmas[i] < sd_vid_gen_params->moe_boundary) {
+                high_noise_sample_steps = i;
+                break;
+            }
+        }
+        LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps);
+        sample_steps = total_steps - high_noise_sample_steps;
+    }

    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(200 * 1024) * 1024;  // 200 MB
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -142,6 +142,7 @@ typedef struct {
    bool chroma_use_dit_mask;
    bool chroma_use_t5_mask;
    int chroma_t5_mask_pad;
+    float flow_shift;
 } sd_ctx_params_t;

 typedef struct {
@ -205,6 +206,7 @@ typedef struct {
    int height;
    sd_sample_params_t sample_params;
    sd_sample_params_t high_noise_sample_params;
+    float moe_boundary;
    float strength;
    int64_t seed;
    int video_frames;
Author	SHA1	Message	Date
leejet	d7f430cd69	docs: update docs and help message	2025-09-07 02:26:44 +08:00
stduhpf	141a4b4113	feat: add flow shift parameter (for SD3 and Wan) (#780 ) * Add flow shift parameter (for SD3 and Wan) * unify code style and fix some issues --------- Co-authored-by: leejet <leejet714@gmail.com>	2025-09-07 02:16:59 +08:00
stduhpf	21ce9fe2cf	feat: add support for timestep boundary based automatic expert routing in Wan MoE (#779 ) * Wan MoE: Automatic expert routing based on timestep boundary * unify code style and fix some issues --------- Co-authored-by: leejet <leejet714@gmail.com>	2025-09-07 01:44:10 +08:00