perf: keep chunk-K residency engaged with runtime LoRA (#1598)

2026-06-09 15:56:39 +00:00 · 2026-06-03 17:12:00 +02:00 · 2026-06-03 17:12:00 +02:00 · a7f2e03da4
commit a7f2e03da4
parent 4513e3fda9
1 changed file with 51 additions and 31 deletions
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -2432,12 +2432,17 @@ protected:
        GGML_ASSERT(gf != nullptr);
        // Keep the plan and resident params under the same live-VRAM cap.
        // Add back our own resident buffer so we don't see chunk-K's
        // allocation as "taken" VRAM and shrink the budget on every step.
        size_t effective_budget = max_graph_vram_bytes;
        if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
            ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
            if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
                size_t free_vram = 0, total_vram = 0;
                ggml_backend_dev_memory(dev, &free_vram, &total_vram);
                if (resident_runtime_params_buffer != nullptr) {
                    free_vram += ggml_backend_buffer_get_size(resident_runtime_params_buffer);
                }
                constexpr size_t safety_margin = 512ull * 1024 * 1024;
                size_t free_clamp              = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
                if (free_clamp < effective_budget) {
@@ -2815,39 +2820,54 @@ public:
                                                            bool no_return = false) {
        GGML_ASSERT(gf != nullptr);
-        // Runtime LoRA mutates CPU weights between calls, so resident GPU
+        // Runtime LoRA composes `weight + diff` in the compute graph via
-        // copies would go stale.
+        // ggml_add; the resident weight tensor's data is never mutated, so
-        if (weight_adapter != nullptr) {
+        // chunk-K residency stays valid across sampling steps.
-            restore_resident_params();
+        // Reserve room for the worst merged segment so chunk-K can't grow
-        } else {
+        // large enough to starve later partial-param allocations.
-            sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
+        size_t worst_merged_segment_footprint = 0;
-            if (base_plan.available) {
+        for (const auto& seg : plan.segments) {
-                sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
+            const size_t fp = seg.input_param_bytes +
                              seg.compute_buffer_size +
                              seg.output_bytes +
                              seg.input_previous_cut_bytes +
                              seg.input_external_bytes;
            if (fp > worst_merged_segment_footprint) {
                worst_merged_segment_footprint = fp;
            }
        }
        const size_t residency_budget_for_annotate =
            residency_budget_bytes > worst_merged_segment_footprint
                ? residency_budget_bytes - worst_merged_segment_footprint
                : 0;
-                std::vector<ggml_tensor*> resident_params;
+        sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
-                uint64_t token = 0;
+        if (base_plan.available) {
-                for (const auto& segment : base_plan.segments) {
+            sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_for_annotate);
-                    if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
+
-                        continue;
+            std::vector<ggml_tensor*> resident_params;
-                    }
+            uint64_t token = 0;
-                    auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
+            for (const auto& segment : base_plan.segments) {
-                    for (ggml_tensor* t : seg_params) {
+                if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
-                        if (t == nullptr)
+                    continue;
                            continue;
                        resident_params.push_back(t);
                        token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
                    }
                }
-                if (token != resident_state_token) {
+                auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
-                    restore_resident_params();
+                for (ggml_tensor* t : seg_params) {
-                    if (!resident_params.empty()) {
+                    if (t == nullptr)
-                        if (offload_resident_params(resident_params)) {
+                        continue;
-                            resident_state_token = token;
+                    resident_params.push_back(t);
-                        } else {
+                    token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
-                            LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
+                }
-                                      get_desc().c_str());
+            }
-                            restore_resident_params();
+            if (token != resident_state_token) {
-                        }
+                restore_resident_params();
                if (!resident_params.empty()) {
                    if (offload_resident_params(resident_params)) {
                        resident_state_token = token;
                    } else {
                        LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
                                  get_desc().c_str());
                        restore_resident_params();
                    }
                }
            }