From 17a2b4a31533193d3d3ee50a0a0ca85403aa53cc Mon Sep 17 00:00:00 2001 From: fszontagh <51741446+fszontagh@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:53:54 +0200 Subject: [PATCH] perf: cap planner budget when model dwarfs the streaming budget (#1612) --- src/core/ggml_extend.hpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp index 7dc37cb9..f78e6cd6 100644 --- a/src/core/ggml_extend.hpp +++ b/src/core/ggml_extend.hpp @@ -2470,10 +2470,26 @@ protected: *effective_budget_out = effective_budget; } + // When streaming and the parameters exceed 3/4 of the effective budget, cap + // the planner at 1/4 of it so it builds smaller merged segments and chunk-K + // can fit alongside. Without streaming the cap only adds dispatch overhead. + size_t planner_budget = effective_budget; + if (stream_layers_enabled) { + size_t total_params_bytes = 0; + for (const ggml_tensor* t : params_tensor_set_) { + if (t != nullptr) { + total_params_bytes += ggml_nbytes(t); + } + } + if (total_params_bytes * 4 > effective_budget * 3) { + planner_budget = effective_budget / 4; + } + } + *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend, gf, &graph_cut_plan_cache_, - effective_budget, + planner_budget, params_tensor_set_, get_desc().c_str()); if (stream_layers_enabled) {