mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-03-24 10:18:51 +00:00
Compare commits
No commits in common. "d6dd6d7b555c233bb9bc9f20b4751eb8c9269743" and "c8fb3d245858d495be1f140efdcfaa0d49de41e5" have entirely different histories.
d6dd6d7b55
...
c8fb3d2458
@ -11,7 +11,6 @@ Caching methods accelerate diffusion inference by reusing intermediate computati
|
||||
| `dbcache` | DiT models | Block-level L1 residual threshold |
|
||||
| `taylorseer` | DiT models | Taylor series approximation |
|
||||
| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
|
||||
| `spectrum` | UNET models | Chebyshev + Taylor output forecasting |
|
||||
|
||||
### UCache (UNET Models)
|
||||
|
||||
@ -119,28 +118,6 @@ Mask values: `1` = compute, `0` = can cache.
|
||||
--scm-policy dynamic
|
||||
```
|
||||
|
||||
### Spectrum (UNET Models)
|
||||
|
||||
Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).
|
||||
|
||||
```bash
|
||||
sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
| Parameter | Description | Default |
|
||||
|-----------|-------------|---------|
|
||||
| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
|
||||
| `m` | Chebyshev polynomial degree | 3 |
|
||||
| `lam` | Ridge regression regularization | 1.0 |
|
||||
| `window` | Initial window size (compute every N steps) | 2 |
|
||||
| `flex` | Window growth per computed step after warmup | 0.50 |
|
||||
| `warmup` | Steps to always compute before caching starts | 4 |
|
||||
| `stop` | Stop caching at this fraction of total steps | 0.9 |
|
||||
|
||||
```
|
||||
|
||||
### Performance Tips
|
||||
|
||||
- Start with default thresholds and adjust based on output quality
|
||||
|
||||
@ -138,12 +138,10 @@ Generation Options:
|
||||
--skip-layers layers to skip for SLG steps (default: [7,8,9])
|
||||
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
|
||||
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
|
||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
|
||||
'spectrum' (UNET Chebyshev+Taylor forecasting)
|
||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
|
||||
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
|
||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
|
||||
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
|
||||
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
|
||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
|
||||
"threshold=0.25" or "threshold=1.5,reset=0"
|
||||
--cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
|
||||
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
|
||||
--scm-policy SCM policy: 'dynamic' (default) or 'static'
|
||||
|
||||
@ -1422,8 +1422,8 @@ struct SDGenerationParams {
|
||||
}
|
||||
cache_mode = argv_to_utf8(index, argv);
|
||||
if (cache_mode != "easycache" && cache_mode != "ucache" &&
|
||||
cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit" && cache_mode != "spectrum") {
|
||||
fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', 'cache-dit', or 'spectrum'\n", cache_mode.c_str());
|
||||
cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit") {
|
||||
fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', or 'cache-dit'\n", cache_mode.c_str());
|
||||
return -1;
|
||||
}
|
||||
return 1;
|
||||
@ -1779,23 +1779,7 @@ struct SDGenerationParams {
|
||||
} else if (key == "Bn" || key == "bn") {
|
||||
cache_params.Bn_compute_blocks = std::stoi(val);
|
||||
} else if (key == "warmup") {
|
||||
if (cache_mode == "spectrum") {
|
||||
cache_params.spectrum_warmup_steps = std::stoi(val);
|
||||
} else {
|
||||
cache_params.max_warmup_steps = std::stoi(val);
|
||||
}
|
||||
} else if (key == "w") {
|
||||
cache_params.spectrum_w = std::stof(val);
|
||||
} else if (key == "m") {
|
||||
cache_params.spectrum_m = std::stoi(val);
|
||||
} else if (key == "lam") {
|
||||
cache_params.spectrum_lam = std::stof(val);
|
||||
} else if (key == "window") {
|
||||
cache_params.spectrum_window_size = std::stoi(val);
|
||||
} else if (key == "flex") {
|
||||
cache_params.spectrum_flex_window = std::stof(val);
|
||||
} else if (key == "stop") {
|
||||
cache_params.spectrum_stop_percent = std::stof(val);
|
||||
} else {
|
||||
LOG_ERROR("error: unknown cache parameter '%s'", key.c_str());
|
||||
return false;
|
||||
@ -1843,15 +1827,6 @@ struct SDGenerationParams {
|
||||
cache_params.Bn_compute_blocks = 0;
|
||||
cache_params.residual_diff_threshold = 0.08f;
|
||||
cache_params.max_warmup_steps = 8;
|
||||
} else if (cache_mode == "spectrum") {
|
||||
cache_params.mode = SD_CACHE_SPECTRUM;
|
||||
cache_params.spectrum_w = 0.40f;
|
||||
cache_params.spectrum_m = 3;
|
||||
cache_params.spectrum_lam = 1.0f;
|
||||
cache_params.spectrum_window_size = 2;
|
||||
cache_params.spectrum_flex_window = 0.50f;
|
||||
cache_params.spectrum_warmup_steps = 4;
|
||||
cache_params.spectrum_stop_percent = 0.9f;
|
||||
}
|
||||
|
||||
if (!cache_option.empty()) {
|
||||
|
||||
@ -251,7 +251,6 @@ enum sd_cache_mode_t {
|
||||
SD_CACHE_DBCACHE,
|
||||
SD_CACHE_TAYLORSEER,
|
||||
SD_CACHE_CACHE_DIT,
|
||||
SD_CACHE_SPECTRUM,
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
@ -272,13 +271,6 @@ typedef struct {
|
||||
int taylorseer_skip_interval;
|
||||
const char* scm_mask;
|
||||
bool scm_policy_dynamic;
|
||||
float spectrum_w;
|
||||
int spectrum_m;
|
||||
float spectrum_lam;
|
||||
int spectrum_window_size;
|
||||
float spectrum_flex_window;
|
||||
int spectrum_warmup_steps;
|
||||
float spectrum_stop_percent;
|
||||
} sd_cache_params_t;
|
||||
|
||||
typedef struct {
|
||||
|
||||
195
src/spectrum.hpp
195
src/spectrum.hpp
@ -1,195 +0,0 @@
|
||||
#ifndef __SPECTRUM_HPP__
|
||||
#define __SPECTRUM_HPP__
|
||||
|
||||
#include <algorithm>
#include <cmath>
#include <cstring>
#include <vector>

#include "ggml_extend.hpp"
|
||||
|
||||
// Tunable parameters for Spectrum output forecasting.
// Defaults mirror the documented CLI defaults (w/m/lam/window/flex/warmup/stop).
struct SpectrumConfig {
    float w            = 0.40f;  // Chebyshev vs Taylor blend weight (0 = pure Taylor, 1 = pure Chebyshev)
    int m              = 3;      // Chebyshev polynomial degree
    float lam          = 1.0f;   // ridge regression regularization strength
    int window_size    = 2;      // initial window size: compute every N steps
    float flex_window  = 0.50f;  // window growth per computed step after warmup
    int warmup_steps   = 4;      // steps always computed before caching starts
    float stop_percent = 0.9f;   // stop predicting at this fraction of total steps
};
|
||||
|
||||
struct SpectrumState {
|
||||
SpectrumConfig config;
|
||||
int cnt = 0;
|
||||
int num_cached = 0;
|
||||
float curr_ws = 2.0f;
|
||||
int K = 6;
|
||||
int stop_step = 0;
|
||||
int total_steps_skipped = 0;
|
||||
|
||||
std::vector<std::vector<float>> H_buf;
|
||||
std::vector<float> T_buf;
|
||||
|
||||
void init(const SpectrumConfig& cfg, size_t total_steps) {
|
||||
config = cfg;
|
||||
cnt = 0;
|
||||
num_cached = 0;
|
||||
curr_ws = (float)cfg.window_size;
|
||||
K = std::max(cfg.m + 1, 6);
|
||||
stop_step = (int)(cfg.stop_percent * (float)total_steps);
|
||||
total_steps_skipped = 0;
|
||||
H_buf.clear();
|
||||
T_buf.clear();
|
||||
}
|
||||
|
||||
float taus(int step_cnt) const {
|
||||
return (step_cnt / 50.0f) * 2.0f - 1.0f;
|
||||
}
|
||||
|
||||
bool should_predict() {
|
||||
if (cnt < config.warmup_steps)
|
||||
return false;
|
||||
if (stop_step > 0 && cnt >= stop_step)
|
||||
return false;
|
||||
if ((int)H_buf.size() < 2)
|
||||
return false;
|
||||
|
||||
int ws = std::max(1, (int)std::floor(curr_ws));
|
||||
return (num_cached + 1) % ws != 0;
|
||||
}
|
||||
|
||||
void update(const struct ggml_tensor* denoised) {
|
||||
int64_t ne = ggml_nelements(denoised);
|
||||
const float* data = (const float*)denoised->data;
|
||||
|
||||
H_buf.emplace_back(data, data + ne);
|
||||
T_buf.push_back(taus(cnt));
|
||||
|
||||
while ((int)H_buf.size() > K) {
|
||||
H_buf.erase(H_buf.begin());
|
||||
T_buf.erase(T_buf.begin());
|
||||
}
|
||||
|
||||
if (cnt >= config.warmup_steps)
|
||||
curr_ws += config.flex_window;
|
||||
|
||||
num_cached = 0;
|
||||
cnt++;
|
||||
}
|
||||
|
||||
void predict(struct ggml_tensor* denoised) {
|
||||
int64_t F = (int64_t)H_buf[0].size();
|
||||
int K_curr = (int)H_buf.size();
|
||||
int M1 = config.m + 1;
|
||||
float tau_at = taus(cnt);
|
||||
|
||||
// Design matrix X: K_curr x M1 (Chebyshev basis)
|
||||
std::vector<float> X(K_curr * M1);
|
||||
for (int i = 0; i < K_curr; i++) {
|
||||
X[i * M1] = 1.0f;
|
||||
if (M1 > 1)
|
||||
X[i * M1 + 1] = T_buf[i];
|
||||
for (int j = 2; j < M1; j++)
|
||||
X[i * M1 + j] = 2.0f * T_buf[i] * X[i * M1 + j - 1] - X[i * M1 + j - 2];
|
||||
}
|
||||
|
||||
// x_star: Chebyshev basis at current tau
|
||||
std::vector<float> x_star(M1);
|
||||
x_star[0] = 1.0f;
|
||||
if (M1 > 1)
|
||||
x_star[1] = tau_at;
|
||||
for (int j = 2; j < M1; j++)
|
||||
x_star[j] = 2.0f * tau_at * x_star[j - 1] - x_star[j - 2];
|
||||
|
||||
// XtX = X^T X + lambda I
|
||||
std::vector<float> XtX(M1 * M1, 0.0f);
|
||||
for (int i = 0; i < M1; i++) {
|
||||
for (int j = 0; j < M1; j++) {
|
||||
float sum = 0.0f;
|
||||
for (int k = 0; k < K_curr; k++)
|
||||
sum += X[k * M1 + i] * X[k * M1 + j];
|
||||
XtX[i * M1 + j] = sum + (i == j ? config.lam : 0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
// Cholesky decomposition
|
||||
std::vector<float> L(M1 * M1, 0.0f);
|
||||
if (!cholesky_decompose(XtX.data(), L.data(), M1)) {
|
||||
float trace = 0.0f;
|
||||
for (int i = 0; i < M1; i++)
|
||||
trace += XtX[i * M1 + i];
|
||||
for (int i = 0; i < M1; i++)
|
||||
XtX[i * M1 + i] += 1e-4f * trace / M1;
|
||||
cholesky_decompose(XtX.data(), L.data(), M1);
|
||||
}
|
||||
|
||||
// Solve XtX v = x_star
|
||||
std::vector<float> v(M1);
|
||||
cholesky_solve(L.data(), x_star.data(), v.data(), M1);
|
||||
|
||||
// Prediction weights per history entry
|
||||
std::vector<float> weights(K_curr, 0.0f);
|
||||
for (int k = 0; k < K_curr; k++)
|
||||
for (int j = 0; j < M1; j++)
|
||||
weights[k] += X[k * M1 + j] * v[j];
|
||||
|
||||
// Blend Chebyshev and Taylor predictions
|
||||
float* out = (float*)denoised->data;
|
||||
float w_cheb = config.w;
|
||||
float w_taylor = 1.0f - w_cheb;
|
||||
const float* h_last = H_buf.back().data();
|
||||
const float* h_prev = H_buf[H_buf.size() - 2].data();
|
||||
|
||||
for (int64_t f = 0; f < F; f++) {
|
||||
float pred_cheb = 0.0f;
|
||||
for (int k = 0; k < K_curr; k++)
|
||||
pred_cheb += weights[k] * H_buf[k][f];
|
||||
|
||||
float pred_taylor = h_last[f] + 0.5f * (h_last[f] - h_prev[f]);
|
||||
|
||||
out[f] = w_taylor * pred_taylor + w_cheb * pred_cheb;
|
||||
}
|
||||
|
||||
num_cached++;
|
||||
total_steps_skipped++;
|
||||
cnt++;
|
||||
}
|
||||
|
||||
private:
|
||||
static bool cholesky_decompose(const float* A, float* L, int n) {
|
||||
std::memset(L, 0, n * n * sizeof(float));
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (int j = 0; j <= i; j++) {
|
||||
float sum = 0.0f;
|
||||
for (int k = 0; k < j; k++)
|
||||
sum += L[i * n + k] * L[j * n + k];
|
||||
if (i == j) {
|
||||
float diag = A[i * n + i] - sum;
|
||||
if (diag <= 0.0f)
|
||||
return false;
|
||||
L[i * n + j] = std::sqrt(diag);
|
||||
} else {
|
||||
L[i * n + j] = (A[i * n + j] - sum) / L[j * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void cholesky_solve(const float* L, const float* b, float* x, int n) {
|
||||
std::vector<float> y(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
float sum = 0.0f;
|
||||
for (int j = 0; j < i; j++)
|
||||
sum += L[i * n + j] * y[j];
|
||||
y[i] = (b[i] - sum) / L[i * n + i];
|
||||
}
|
||||
for (int i = n - 1; i >= 0; i--) {
|
||||
float sum = 0.0f;
|
||||
for (int j = i + 1; j < n; j++)
|
||||
sum += L[j * n + i] * x[j];
|
||||
x[i] = (y[i] - sum) / L[i * n + i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __SPECTRUM_HPP__
|
||||
@ -16,7 +16,6 @@
|
||||
#include "esrgan.hpp"
|
||||
#include "lora.hpp"
|
||||
#include "pmid.hpp"
|
||||
#include "spectrum.hpp"
|
||||
#include "tae.hpp"
|
||||
#include "ucache.hpp"
|
||||
#include "vae.hpp"
|
||||
@ -1688,11 +1687,9 @@ public:
|
||||
EasyCacheState easycache_state;
|
||||
UCacheState ucache_state;
|
||||
CacheDitConditionState cachedit_state;
|
||||
SpectrumState spectrum_state;
|
||||
bool easycache_enabled = false;
|
||||
bool ucache_enabled = false;
|
||||
bool cachedit_enabled = false;
|
||||
bool spectrum_enabled = false;
|
||||
|
||||
if (cache_params != nullptr && cache_params->mode != SD_CACHE_DISABLED) {
|
||||
bool percent_valid = true;
|
||||
@ -1796,27 +1793,6 @@ public:
|
||||
LOG_WARN("CacheDIT requested but could not be initialized for this run");
|
||||
}
|
||||
}
|
||||
} else if (cache_params->mode == SD_CACHE_SPECTRUM) {
|
||||
bool spectrum_supported = sd_version_is_unet(version);
|
||||
if (!spectrum_supported) {
|
||||
LOG_WARN("Spectrum requested but not supported for this model type (only UNET models)");
|
||||
} else {
|
||||
SpectrumConfig spectrum_config;
|
||||
spectrum_config.w = cache_params->spectrum_w;
|
||||
spectrum_config.m = cache_params->spectrum_m;
|
||||
spectrum_config.lam = cache_params->spectrum_lam;
|
||||
spectrum_config.window_size = cache_params->spectrum_window_size;
|
||||
spectrum_config.flex_window = cache_params->spectrum_flex_window;
|
||||
spectrum_config.warmup_steps = cache_params->spectrum_warmup_steps;
|
||||
spectrum_config.stop_percent = cache_params->spectrum_stop_percent;
|
||||
size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0;
|
||||
spectrum_state.init(spectrum_config, total_steps);
|
||||
spectrum_enabled = true;
|
||||
LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%",
|
||||
spectrum_config.w, spectrum_config.m, spectrum_config.lam,
|
||||
spectrum_config.window_size, spectrum_config.flex_window,
|
||||
spectrum_config.warmup_steps, spectrum_config.stop_percent * 100.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -2040,28 +2016,6 @@ public:
|
||||
}
|
||||
|
||||
timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask);
|
||||
|
||||
if (spectrum_enabled && spectrum_state.should_predict()) {
|
||||
spectrum_state.predict(denoised);
|
||||
|
||||
if (denoise_mask != nullptr) {
|
||||
apply_mask(denoised, init_latent, denoise_mask);
|
||||
}
|
||||
|
||||
if (sd_preview_cb != nullptr && sd_should_preview_denoised()) {
|
||||
if (step % sd_get_preview_interval() == 0) {
|
||||
preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t t1 = ggml_time_us();
|
||||
if (step > 0 || step == -(int)steps) {
|
||||
int showstep = std::abs(step);
|
||||
pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
|
||||
}
|
||||
return denoised;
|
||||
}
|
||||
|
||||
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
|
||||
std::vector<float> guidance_vec(1, guidance.distilled_guidance);
|
||||
auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec);
|
||||
@ -2235,10 +2189,6 @@ public:
|
||||
vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
|
||||
}
|
||||
|
||||
if (spectrum_enabled) {
|
||||
spectrum_state.update(denoised);
|
||||
}
|
||||
|
||||
if (denoise_mask != nullptr) {
|
||||
apply_mask(denoised, init_latent, denoise_mask);
|
||||
}
|
||||
@ -2330,14 +2280,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if (spectrum_enabled && spectrum_state.total_steps_skipped > 0) {
|
||||
size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0;
|
||||
double speedup = static_cast<double>(total_steps) /
|
||||
static_cast<double>(total_steps - spectrum_state.total_steps_skipped);
|
||||
LOG_INFO("Spectrum skipped %d/%zu steps (%.2fx estimated speedup)",
|
||||
spectrum_state.total_steps_skipped, total_steps, speedup);
|
||||
}
|
||||
|
||||
if (inverse_noise_scaling) {
|
||||
x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x);
|
||||
}
|
||||
@ -2584,14 +2526,14 @@ public:
|
||||
tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y);
|
||||
}
|
||||
|
||||
ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x) {
|
||||
ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
|
||||
int64_t t0 = ggml_time_ms();
|
||||
ggml_tensor* result = nullptr;
|
||||
const int vae_scale_factor = get_vae_scale_factor();
|
||||
int64_t W = x->ne[0] / vae_scale_factor;
|
||||
int64_t H = x->ne[1] / vae_scale_factor;
|
||||
int64_t C = get_latent_channel();
|
||||
if (vae_tiling_params.enabled) {
|
||||
if (vae_tiling_params.enabled && !encode_video) {
|
||||
// TODO wan2.2 vae support?
|
||||
int64_t ne2;
|
||||
int64_t ne3;
|
||||
@ -2619,7 +2561,7 @@ public:
|
||||
|
||||
if (!use_tiny_autoencoder) {
|
||||
process_vae_input_tensor(x);
|
||||
if (vae_tiling_params.enabled) {
|
||||
if (vae_tiling_params.enabled && !encode_video) {
|
||||
float tile_overlap;
|
||||
int tile_size_x, tile_size_y;
|
||||
// multiply tile size for encode to keep the compute buffer size consistent
|
||||
@ -2636,7 +2578,7 @@ public:
|
||||
}
|
||||
first_stage_model->free_compute_buffer();
|
||||
} else {
|
||||
if (vae_tiling_params.enabled) {
|
||||
if (vae_tiling_params.enabled && !encode_video) {
|
||||
// split latent in 32x32 tiles and compute in several steps
|
||||
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
||||
return tae_first_stage->compute(n_threads, in, false, &out, nullptr);
|
||||
@ -2712,8 +2654,8 @@ public:
|
||||
return latent;
|
||||
}
|
||||
|
||||
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
|
||||
ggml_tensor* vae_output = vae_encode(work_ctx, x);
|
||||
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
|
||||
ggml_tensor* vae_output = vae_encode(work_ctx, x, encode_video);
|
||||
return get_first_stage_encoding(work_ctx, vae_output);
|
||||
}
|
||||
|
||||
@ -2999,13 +2941,6 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
|
||||
cache_params->taylorseer_skip_interval = 1;
|
||||
cache_params->scm_mask = nullptr;
|
||||
cache_params->scm_policy_dynamic = true;
|
||||
cache_params->spectrum_w = 0.40f;
|
||||
cache_params->spectrum_m = 3;
|
||||
cache_params->spectrum_lam = 1.0f;
|
||||
cache_params->spectrum_window_size = 2;
|
||||
cache_params->spectrum_flex_window = 0.50f;
|
||||
cache_params->spectrum_warmup_steps = 4;
|
||||
cache_params->spectrum_stop_percent = 0.9f;
|
||||
}
|
||||
|
||||
void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user