perf: cap planner budget when model dwarfs the streaming budget (#1612)

2026-06-10 00:06:38 +00:00 · 2026-06-08 15:53:54 +02:00 · 2026-06-08 15:53:54 +02:00 · 17a2b4a315
commit 17a2b4a315
parent b3d56d0ba1
1 changed files with 17 additions and 1 deletions
--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
@ -2470,10 +2470,26 @@ protected:
            *effective_budget_out = effective_budget;
        }
        // When streaming and the parameter tensors total more than three
        // quarters of the effective budget, cap the planner at a quarter of
        // that budget so it builds smaller merged segments and chunk-K can fit
        // alongside. Without streaming the cap only adds dispatch overhead.
        size_t planner_budget = effective_budget;
        if (stream_layers_enabled) {
            size_t total_params_bytes = 0;
            for (const ggml_tensor* t : params_tensor_set_) {
                if (t != nullptr) {
                    total_params_bytes += ggml_nbytes(t);
                }
            }
            if (total_params_bytes * 4 > effective_budget * 3) {
                planner_budget = effective_budget / 4;
            }
        }
        *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend,
                                                     gf,
                                                     &graph_cut_plan_cache_,
-                                                     effective_budget,
+                                                     planner_budget,
                                                     params_tensor_set_,
                                                     get_desc().c_str());
        if (stream_layers_enabled) {