chore: set release tag by commit count

refactor: simplify DPM++ (2S) Ancestral (#667 )
refactor: move tiling calc and debug print into the tiling code branch (#833 )
2025-12-13 05:48:56 +00:00 · 2025-09-16 23:24:36 +08:00 · 2025-09-16 23:05:25 +08:00 · 2025-09-16 22:46:56 +08:00 · 2025-09-16 22:42:09 +08:00
6 changed files with 143 additions and 38 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -296,6 +296,10 @@ jobs:
          pattern: sd-*
          merge-multiple: true

+      - name: Get commit count
+        id: commit_count
+        run: echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+
      - name: Get commit hash
        id: commit
        uses: pr-mpt/actions-commit-hash@v2
@ -306,7 +310,10 @@ jobs:
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
-          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+          tag_name: >
+            ${{ github.ref_name == 'master' &&
+                format('release_{0}_{1}', steps.commit_count.outputs.count, steps.commit.outputs.short) ||
+                format('{0}-{1}', env.BRANCH_NAME, steps.commit.outputs.short) }}

      - name: Upload release
        id: upload_release
--- a/README.md
+++ b/README.md
@ -326,9 +326,10 @@ arguments:
  --skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])
  --skip-layer-start START           SLG enabling point: (default: 0.01)
  --skip-layer-end END               SLG disabling point: (default: 0.2)
-  --scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)
+  --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
                                     sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise)
+  --timestep-shift N                 shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
  --steps  STEPS                     number of sample steps (default: 20)
  --high-noise-cfg-scale SCALE       (high noise) unconditional guidance scale: (default: 7.0)
  --high-noise-img-cfg-scale SCALE   (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
@ -339,7 +340,7 @@ arguments:
  --high-noise-skip-layers LAYERS    (high noise) Layers to skip for SLG steps: (default: [7,8,9])
  --high-noise-skip-layer-start      (high noise) SLG enabling point: (default: 0.01)
  --high-noise-skip-layer-end END    (high noise) SLG disabling point: (default: 0.2)
-  --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)
+  --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
                                     (high noise) sampling method (default: "euler_a")
  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)
@ -352,7 +353,7 @@ arguments:
  --rng {std_default, cuda}          RNG (default: cuda)
  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
  -b, --batch-count COUNT            number of images to generate
-  --clip-skip N                      ignore last_dot_pos layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
+  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
  --vae-tiling                       process vae in tiles to reduce memory usage
  --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)
--- a/denoiser.hpp
+++ b/denoiser.hpp
@ -232,6 +232,25 @@ struct GITSSchedule : SigmaSchedule {
    }
 };

+// Sigma schedule derived from timesteps requested evenly between TIMESTEPS - 1 and 0.
+struct SGMUniformSchedule : SigmaSchedule {
+    // Returns n sigmas followed by a final 0.0f (just {0.0f} when n == 0); sigma_min_in/sigma_max_in are unused.
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override {
+        if (n == 0) {
+            return {0.0f};
+        }
+        // Ask for n + 1 points but convert only the first n; the literal 0.0f below ends the schedule.
+        const std::vector<float> timesteps = linear_space(static_cast<float>(TIMESTEPS - 1), 0.0f, n + 1);
+        std::vector<float> result;
+        result.reserve(n + 1);
+        for (uint32_t i = 0; i < n; ++i) {  // unsigned index matches n's type
+            result.push_back(t_to_sigma_func(timesteps[i]));
+        }
+        result.push_back(0.0f);
+        return result;
+    }
+};
+
 struct KarrasSchedule : SigmaSchedule {
    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
        // These *COULD* be function arguments here,
@ -251,6 +270,35 @@ struct KarrasSchedule : SigmaSchedule {
    }
 };

+// Sigma schedule that samples timestep TIMESTEPS - 1 - trunc(i * TIMESTEPS / n) for i = 0 .. n - 1.
+struct SimpleSchedule : SigmaSchedule {
+    // Returns n sigmas followed by a final 0.0f, or an empty vector when n == 0.
+    // sigma_min and sigma_max are not used by this schedule.
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
+        std::vector<float> sigmas;
+        if (n == 0) {
+            return sigmas;
+        }
+        sigmas.reserve(n + 1);
+
+        const int total_steps = TIMESTEPS;
+        // Fractional distance between two consecutive picks.
+        const float stride = static_cast<float>(total_steps) / static_cast<float>(n);
+
+        for (uint32_t k = 0; k < n; ++k) {
+            // Truncate the scaled position to a whole offset from the last timestep.
+            const int offset = static_cast<int>(static_cast<float>(k) * stride);
+            const int picked = total_steps - 1 - offset;
+            // Never index below timestep 0.
+            sigmas.push_back(t_to_sigma(static_cast<float>(picked < 0 ? 0 : picked)));
+        }
+
+        // Close the schedule with sigma 0.
+        sigmas.push_back(0.0f);
+        return sigmas;
+    }
+};
+
 // Close to Beta Schedule, but increadably simple in code.
 struct SmoothStepSchedule : SigmaSchedule {
    static constexpr float smoothstep(float x) {
@ -722,7 +770,6 @@ static void sample_k_diffusion(sample_method_t method,
        } break;
        case DPMPP2S_A: {
            struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
-            struct ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* x2    = ggml_dup_tensor(work_ctx, x);

            for (int i = 0; i < steps; i++) {
@ -737,22 +784,15 @@ static void sample_k_diffusion(sample_method_t method,
                auto sigma_fn    = [](float t) -> float { return exp(-t); };

                if (sigma_down == 0) {
-                    // Euler step
-                    float* vec_d        = (float*)d->data;
+                    // d = (x - denoised) / sigmas[i];
+                    // dt = sigma_down - sigmas[i];
+                    // x += d * dt;
+                    // => x = denoised
                    float* vec_x        = (float*)x->data;
                    float* vec_denoised = (float*)denoised->data;

-                    for (int j = 0; j < ggml_nelements(d); j++) {
-                        vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
-                    }
-
-                    // TODO: If sigma_down == 0, isn't this wrong?
-                    // But
-                    // https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py#L525
-                    // has this exactly the same way.
-                    float dt = sigma_down - sigmas[i];
-                    for (int j = 0; j < ggml_nelements(d); j++) {
-                        vec_x[j] = vec_x[j] + vec_d[j] * dt;
+                    for (int j = 0; j < ggml_nelements(x); j++) {
+                        vec_x[j] = vec_denoised[j];
                    }
                } else {
                    // DPM-Solver++(2S)
@ -761,7 +801,6 @@ static void sample_k_diffusion(sample_method_t method,
                    float h      = t_next - t;
                    float s      = t + 0.5f * h;

-                    float* vec_d        = (float*)d->data;
                    float* vec_x        = (float*)x->data;
                    float* vec_x2       = (float*)x2->data;
                    float* vec_denoised = (float*)denoised->data;
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -248,9 +248,10 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])\n");
    printf("  --skip-layer-start START           SLG enabling point: (default: 0.01)\n");
    printf("  --skip-layer-end END               SLG disabling point: (default: 0.2)\n");
-    printf("  --scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)\n");
+    printf("  --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n");
    printf("  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
    printf("                                     sampling method (default: \"euler\" for Flux/SD3/Wan, \"euler_a\" otherwise)\n");
+    printf("  --timestep-shift N                 shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant\n");
    printf("  --steps  STEPS                     number of sample steps (default: 20)\n");
    printf("  --high-noise-cfg-scale SCALE       (high noise) unconditional guidance scale: (default: 7.0)\n");
    printf("  --high-noise-img-cfg-scale SCALE   (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
@ -261,7 +262,7 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --high-noise-skip-layers LAYERS    (high noise) Layers to skip for SLG steps: (default: [7,8,9])\n");
    printf("  --high-noise-skip-layer-start      (high noise) SLG enabling point: (default: 0.01)\n");
    printf("  --high-noise-skip-layer-end END    (high noise) SLG disabling point: (default: 0.2)\n");
-    printf("  --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)\n");
+    printf("  --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n");
    printf("  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
    printf("                                     (high noise) sampling method (default: \"euler_a\")\n");
    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)\n");
@ -274,7 +275,7 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
    printf("  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)\n");
    printf("  -b, --batch-count COUNT            number of images to generate\n");
-    printf("  --clip-skip N                      ignore last_dot_pos layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
+    printf("  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
    printf("                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
    printf("  --vae-tiling                       process vae in tiles to reduce memory usage\n");
    printf("  --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)\n");
@ -520,6 +521,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad},
        {"", "--video-frames", "", &params.video_frames},
        {"", "--fps", "", &params.fps},
+        {"", "--timestep-shift", "", &params.sample_params.shifted_timestep},
    };

    options.float_options = {
@ -875,6 +877,11 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        exit(1);
    }

+    if (params.sample_params.shifted_timestep < 0 || params.sample_params.shifted_timestep > 1000) {
+        fprintf(stderr, "error: timestep-shift must be between 0 and 1000\n");
+        exit(1);
+    }
+
    if (params.upscale_repeats < 1) {
        fprintf(stderr, "error: upscale multiplier must be at least 1\n");
        exit(1);
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -747,6 +747,16 @@ public:
                denoiser->scheduler          = std::make_shared<GITSSchedule>();
                denoiser->scheduler->version = version;
                break;
+            case SGM_UNIFORM:
+                    LOG_INFO("Running with SGM Uniform schedule");
+                    denoiser->scheduler          = std::make_shared<SGMUniformSchedule>();
+                    denoiser->scheduler->version = version;
+                    break;
+            case SIMPLE:
+                    LOG_INFO("Running with Simple schedule");
+                    denoiser->scheduler          = std::make_shared<SimpleSchedule>();
+                    denoiser->scheduler->version = version;
+                    break;
            case SMOOTHSTEP:
                LOG_INFO("Running with SmoothStep scheduler");
                denoiser->scheduler = std::make_shared<SmoothStepSchedule>();
@ -1033,6 +1043,7 @@ public:
                        float control_strength,
                        sd_guidance_params_t guidance,
                        float eta,
+                        int shifted_timestep,
                        sample_method_t method,
                        const std::vector<float>& sigmas,
                        int start_merge_step,
@ -1042,6 +1053,10 @@ public:
                        ggml_tensor* denoise_mask             = NULL,
                        ggml_tensor* vace_context             = NULL,
                        float vace_strength                   = 1.f) {
+         if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) {
+            LOG_WARN("timestep shifting is only supported for SDXL models!");
+            shifted_timestep = 0;
+        }
        std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);

        float cfg_scale     = guidance.txt_cfg;
@ -1102,7 +1117,17 @@ public:
            float c_in   = scaling[2];

            float t = denoiser->sigma_to_t(sigma);
-            std::vector<float> timesteps_vec(1, t);  // [N, ]
+            std::vector<float> timesteps_vec;
+            if (shifted_timestep > 0 && sd_version_is_sdxl(version)) {
+                float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS));
+                int64_t shifted_t     = static_cast<int64_t>(roundf(shifted_t_float));
+                shifted_t             = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t));
+                LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma);
+                timesteps_vec.assign(1, (float)shifted_t);
+            } else {
+                timesteps_vec.assign(1, t);
+            }
+            
            timesteps_vec  = process_timesteps(timesteps_vec, init_latent, denoise_mask);
            auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
            std::vector<float> guidance_vec(1, guidance.distilled_guidance);
@ -1200,6 +1225,19 @@ public:
            float* vec_input     = (float*)input->data;
            float* positive_data = (float*)out_cond->data;
            int ne_elements      = (int)ggml_nelements(denoised);
+
+            if (shifted_timestep > 0 && sd_version_is_sdxl(version)) {
+                int64_t shifted_t_idx              = static_cast<int64_t>(roundf(timesteps_vec[0]));
+                float shifted_sigma                = denoiser->t_to_sigma((float)shifted_t_idx);
+                std::vector<float> shifted_scaling = denoiser->get_scalings(shifted_sigma);
+                float shifted_c_skip               = shifted_scaling[0];
+                float shifted_c_out                = shifted_scaling[1];
+                float shifted_c_in                 = shifted_scaling[2];
+
+                c_skip = shifted_c_skip * c_in / shifted_c_in;
+                c_out  = shifted_c_out;
+            }
+
            for (int i = 0; i < ne_elements; i++) {
                float latent_result = positive_data[i];
                if (has_unconditioned) {
@ -1222,6 +1260,7 @@ public:
                // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
                vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
            }
+
            int64_t t1 = ggml_time_us();
            if (step > 0) {
                pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
@ -1323,15 +1362,15 @@ public:
        }

        if (!use_tiny_autoencoder) {
-            float tile_overlap;
-            int tile_size_x, tile_size_y;
-            // multiply tile size for encode to keep the compute buffer size consistent
-            get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, W, H, 1.30539f);
-
-            LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
-
            process_vae_input_tensor(x);
            if (vae_tiling_params.enabled && !encode_video) {
+                float tile_overlap;
+                int tile_size_x, tile_size_y;
+                // multiply tile size for encode to keep the compute buffer size consistent
+                get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, W, H, 1.30539f);
+
+                LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
+
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                    first_stage_model->compute(n_threads, in, false, &out, work_ctx);
                };
@ -1468,15 +1507,15 @@ public:
        }
        int64_t t0 = ggml_time_ms();
        if (!use_tiny_autoencoder) {
-            float tile_overlap;
-            int tile_size_x, tile_size_y;
-            get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, x->ne[0], x->ne[1]);
-
-            LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
-
            process_latent_out(x);
            // x = load_tensor_from_file(work_ctx, "wan_vae_z.bin");
            if (vae_tiling_params.enabled && !decode_video) {
+                float tile_overlap;
+                int tile_size_x, tile_size_y;
+                get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, x->ne[0], x->ne[1]);
+
+                LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
+
                // split latent in 32x32 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                    first_stage_model->compute(n_threads, in, true, &out, NULL);
@ -1588,6 +1627,8 @@ const char* schedule_to_str[] = {
    "exponential",
    "ays",
    "gits",
+    "sgm_uniform",
+    "simple",
    "smoothstep",
 };

@ -1720,7 +1761,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
             "scheduler: %s, "
             "sample_method: %s, "
             "sample_steps: %d, "
-             "eta: %.2f)",
+             "eta: %.2f, "
+             "shifted_timestep: %d)",
             sample_params->guidance.txt_cfg,
             sample_params->guidance.img_cfg,
             sample_params->guidance.distilled_guidance,
@ -1731,7 +1773,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
             sd_schedule_name(sample_params->scheduler),
             sd_sample_method_name(sample_params->sample_method),
             sample_params->sample_steps,
-             sample_params->eta);
+             sample_params->eta,
+             sample_params->shifted_timestep);

    return buf;
 }
@ -1863,6 +1906,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                    int clip_skip,
                                    sd_guidance_params_t guidance,
                                    float eta,
+                                    int shifted_timestep,
                                    int width,
                                    int height,
                                    enum sample_method_t sample_method,
@ -2101,6 +2145,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                                     control_strength,
                                                     guidance,
                                                     eta,
+                                                     shifted_timestep,
                                                     sample_method,
                                                     sigmas,
                                                     start_merge_step,
@ -2394,6 +2439,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                                        sd_img_gen_params->clip_skip,
                                                        sd_img_gen_params->sample_params.guidance,
                                                        sd_img_gen_params->sample_params.eta,
+                                                        sd_img_gen_params->sample_params.shifted_timestep,
                                                        width,
                                                        height,
                                                        sample_method,
@ -2734,6 +2780,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                                 0,
                                 sd_vid_gen_params->high_noise_sample_params.guidance,
                                 sd_vid_gen_params->high_noise_sample_params.eta,
+                                 sd_vid_gen_params->high_noise_sample_params.shifted_timestep,
                                 sd_vid_gen_params->high_noise_sample_params.sample_method,
                                 high_noise_sigmas,
                                 -1,
@ -2769,6 +2816,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                                          0,
                                          sd_vid_gen_params->sample_params.guidance,
                                          sd_vid_gen_params->sample_params.eta,
+                                          sd_vid_gen_params->sample_params.shifted_timestep,
                                          sd_vid_gen_params->sample_params.sample_method,
                                          sigmas,
                                          -1,
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -58,6 +58,8 @@ enum scheduler_t {
    EXPONENTIAL,
    AYS,
    GITS,
+    SGM_UNIFORM,
+    SIMPLE,
    SMOOTHSTEP,
    SCHEDULE_COUNT
 };
@ -183,6 +185,7 @@ typedef struct {
    enum sample_method_t sample_method;
    int sample_steps;
    float eta;
+    int shifted_timestep;
 } sd_sample_params_t;

 typedef struct {
Author	SHA1	Message	Date
leejet	79426d578e	chore: set release tag by commit count	2025-09-16 23:24:36 +08:00
vmobilis	97ad3e7ff9	refactor: simplify DPM++ (2S) Ancestral (#667 )	2025-09-16 23:05:25 +08:00
Erik Scholz	8909523e92	refactor: move tiling calc and debug print into the tiling code branch (#833 )	2025-09-16 22:46:56 +08:00
rmatif	8376dfba2a	feat: add sgm_uniform scheduler, simple scheduler, and support for NitroFusion (#675 ) * feat: Add timestep shift and two new schedulers * update readme * fix spaces * format code * simplify SGMUniformSchedule * simplify shifted_timestep logic * avoid conflict --------- Co-authored-by: leejet <leejet714@gmail.com>	2025-09-16 22:42:09 +08:00