2025-12-13 05:48:56 +00:00
6 changed files with 21 additions and 67 deletions
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
 ***Note that this project is under active development. \
-API and command-line option may change frequently.***
+API and command-line parameters may change frequently.***
 ## Features
@@ -290,10 +290,9 @@ usage: ./bin/sd [arguments]
 arguments:
  -h, --help                         show this help message and exit
-  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, convert], default: img_gen
+  -M, --mode [MODE]                  run mode, one of: [img_gen, convert], default: img_gen
  -t, --threads N                    number of threads to use during computation (default: -1)
                                     If threads <= 0, then threads will be set to the number of CPU physical cores
-  --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
  -m, --model [MODEL]                path to full model
  --diffusion-model                  path to the standalone diffusion model
  --high-noise-diffusion-model       path to the standalone high noise diffusion model
@@ -347,7 +346,7 @@ arguments:
  --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
                                     (high noise) sampling method (default: "euler_a")
-  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)
+  --high-noise-steps  STEPS          (high noise) number of sample steps (default: 20)
                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
  --strength STRENGTH                strength for noising/unnoising (default: 0.75)
  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)
@@ -378,9 +377,6 @@ arguments:
  --chroma-t5-mask-pad  PAD_SIZE     t5 mask pad size of chroma
  --video-frames                     video frames (default: 1)
  --fps                              fps (default: 24)
-  --moe-boundary BOUNDARY            timestep boundary for Wan2.2 MoE model. (default: 0.875)
-                                     only enabled if `--high-noise-steps` is set to -1
-  --flow-shift SHIFT                 shift value for Flow models like SD3.x or WAN (default: auto)
  -v, --verbose                      print extra info
 ```
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -382,8 +382,7 @@ struct DiscreteFlowDenoiser : public Denoiser {
    float sigma_data = 1.0f;
-    DiscreteFlowDenoiser(float shift = 3.0f)
+    DiscreteFlowDenoiser() {
-        : shift(shift) {
        set_parameters();
    }
--- a/docs/wan.md
+++ b/docs/wan.md
@@ -43,6 +43,8 @@
 ## Examples
 Since GitHub does not support AVI files, the file I uploaded was converted from AVI to MP4.
 ### Wan2.1 T2V 1.3B
 ```
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -89,8 +89,6 @@ struct SDParams {
    std::vector<int> high_noise_skip_layers = {7, 8, 9};
    sd_sample_params_t high_noise_sample_params;
-    float moe_boundary = 0.875f;
    int video_frames = 1;
    int fps          = 16;
@@ -115,12 +113,10 @@ struct SDParams {
    bool chroma_use_dit_mask = true;
    bool chroma_use_t5_mask  = false;
    int chroma_t5_mask_pad   = 1;
-    float flow_shift         = INFINITY;
    SDParams() {
        sd_sample_params_init(&sample_params);
        sd_sample_params_init(&high_noise_sample_params);
-        high_noise_sample_params.sample_steps = -1;
    }
 };
@@ -171,8 +167,6 @@ void print_params(SDParams params) {
    printf("    height:                            %d\n", params.height);
    printf("    sample_params:                     %s\n", SAFE_STR(sample_params_str));
    printf("    high_noise_sample_params:          %s\n", SAFE_STR(high_noise_sample_params_str));
-    printf("    moe_boundary:                      %.3f\n", params.moe_boundary);
-    printf("    flow_shift:                        %.2f\n", params.flow_shift);
    printf("    strength(img2img):                 %.2f\n", params.strength);
    printf("    rng:                               %s\n", sd_rng_type_name(params.rng_type));
    printf("    seed:                              %ld\n", params.seed);
@@ -193,10 +187,9 @@ void print_usage(int argc, const char* argv[]) {
    printf("\n");
    printf("arguments:\n");
    printf("  -h, --help                         show this help message and exit\n");
-    printf("  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, convert], default: img_gen\n");
+    printf("  -M, --mode [MODE]                  run mode, one of: [img_gen, convert], default: img_gen\n");
    printf("  -t, --threads N                    number of threads to use during computation (default: -1)\n");
    printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
-    printf("  --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed\n");
    printf("  -m, --model [MODEL]                path to full model\n");
    printf("  --diffusion-model                  path to the standalone diffusion model\n");
    printf("  --high-noise-diffusion-model       path to the standalone high noise diffusion model\n");
@@ -250,7 +243,7 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
    printf("  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
    printf("                                     (high noise) sampling method (default: \"euler_a\")\n");
-    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)\n");
+    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: 20)\n");
    printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
    printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
    printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)\n");
@@ -281,9 +274,6 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --chroma-t5-mask-pad  PAD_SIZE     t5 mask pad size of chroma\n");
    printf("  --video-frames                     video frames (default: 1)\n");
    printf("  --fps                              fps (default: 24)\n");
-    printf("  --moe-boundary BOUNDARY            timestep boundary for Wan2.2 MoE model. (default: 0.875)\n");
-    printf("                                     only enabled if `--high-noise-steps` is set to -1\n");
-    printf("  --flow-shift SHIFT                 shift value for Flow models like SD3.x or WAN (default: auto)\n");
    printf("  -v, --verbose                      print extra info\n");
 }
@@ -517,8 +507,6 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        {"", "--strength", "", &params.strength},
        {"", "--style-ratio", "", &params.style_ratio},
        {"", "--control-strength", "", &params.control_strength},
-        {"", "--moe-boundary", "", &params.moe_boundary},
-        {"", "--flow-shift", "", &params.flow_shift},
    };
    options.bool_options = {
@@ -779,7 +767,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
    }
    if (params.high_noise_sample_params.sample_steps <= 0) {
-        params.high_noise_sample_params.sample_steps = -1;
+        fprintf(stderr, "error: the high_noise_sample_steps must be greater than 0\n");
+        exit(1);
    }
    if (params.strength < 0.f || params.strength > 1.f) {
@@ -1186,7 +1175,6 @@ int main(int argc, const char* argv[]) {
        params.chroma_use_dit_mask,
        params.chroma_use_t5_mask,
        params.chroma_t5_mask_pad,
-        params.flow_shift,
    };
    sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
@@ -1234,7 +1222,6 @@ int main(int argc, const char* argv[]) {
            params.height,
            params.sample_params,
            params.high_noise_sample_params,
-            params.moe_boundary,
            params.strength,
            params.seed,
            params.video_frames,
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -681,11 +681,7 @@ public:
        if (sd_version_is_sd3(version)) {
            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
+            denoiser = std::make_shared<DiscreteFlowDenoiser>();
-            if (shift == INFINITY) {
-                shift = 3.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
        } else if (sd_version_is_flux(version)) {
            LOG_INFO("running in Flux FLOW mode");
            float shift = 1.0f;  // TODO: validate
@@ -698,11 +694,7 @@ public:
            denoiser = std::make_shared<FluxFlowDenoiser>(shift);
        } else if (sd_version_is_wan(version)) {
            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
+            denoiser = std::make_shared<DiscreteFlowDenoiser>();
-            if (shift == INFINITY) {
-                shift = 5.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
        } else if (is_using_v_parameterization) {
            LOG_INFO("running in v-prediction mode");
            denoiser = std::make_shared<CompVisVDenoiser>();
@@ -1561,7 +1553,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    sd_ctx_params->chroma_use_dit_mask     = true;
    sd_ctx_params->chroma_use_t5_mask      = false;
    sd_ctx_params->chroma_t5_mask_pad      = 1;
-    sd_ctx_params->flow_shift              = INFINITY;
 }
 char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
@@ -1736,13 +1727,11 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
    memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
    sd_sample_params_init(&sd_vid_gen_params->sample_params);
    sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
-    sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;
    sd_vid_gen_params->width        = 512;
    sd_vid_gen_params->height       = 512;
    sd_vid_gen_params->strength     = 0.75f;
    sd_vid_gen_params->seed         = -1;
    sd_vid_gen_params->video_frames = 6;
-    sd_vid_gen_params->moe_boundary                          = 0.875f;
 }
 struct sd_ctx_t {
@@ -2392,24 +2381,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
        high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
    }
-    int total_steps = sample_steps;
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps + high_noise_sample_steps);
-    if (high_noise_sample_steps > 0) {
-        total_steps += high_noise_sample_steps;
-    }
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps);
-    if (high_noise_sample_steps < 0) {
-        // timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
-        for (size_t i = 0; i < sigmas.size(); ++i) {
-            if (sigmas[i] < sd_vid_gen_params->moe_boundary) {
-                high_noise_sample_steps = i;
-                break;
-            }
-        }
-        LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps);
-        sample_steps = total_steps - high_noise_sample_steps;
-    }
    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(200 * 1024) * 1024;  // 200 MB
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -142,7 +142,6 @@ typedef struct {
    bool chroma_use_dit_mask;
    bool chroma_use_t5_mask;
    int chroma_t5_mask_pad;
-    float flow_shift;
 } sd_ctx_params_t;
 typedef struct {
@@ -206,7 +205,6 @@ typedef struct {
    int height;
    sd_sample_params_t sample_params;
    sd_sample_params_t high_noise_sample_params;
-    float moe_boundary;
    float strength;
    int64_t seed;
    int video_frames;