perf: keep chunk-K residency engaged with runtime LoRA (#1598)

This commit is contained in:
fszontagh 2026-06-03 17:12:00 +02:00 committed by GitHub
parent 4513e3fda9
commit a7f2e03da4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -2432,12 +2432,17 @@ protected:
GGML_ASSERT(gf != nullptr);
// Keep the plan and resident params under the same live-VRAM cap.
// Add back our own resident buffer so we don't see chunk-K's
// allocation as "taken" VRAM and shrink the budget on every step.
size_t effective_budget = max_graph_vram_bytes;
if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
size_t free_vram = 0, total_vram = 0;
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
if (resident_runtime_params_buffer != nullptr) {
free_vram += ggml_backend_buffer_get_size(resident_runtime_params_buffer);
}
constexpr size_t safety_margin = 512ull * 1024 * 1024;
size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
if (free_clamp < effective_budget) {
@@ -2815,14 +2820,30 @@ public:
bool no_return = false) {
GGML_ASSERT(gf != nullptr);
// Runtime LoRA mutates CPU weights between calls, so resident GPU
// copies would go stale.
if (weight_adapter != nullptr) {
restore_resident_params();
} else {
// Runtime LoRA composes `weight + diff` in the compute graph via
// ggml_add; the resident weight tensor's data is never mutated, so
// chunk-K residency stays valid across sampling steps.
// Reserve room for the worst merged segment so chunk-K can't grow
// large enough to starve later partial-param allocations.
size_t worst_merged_segment_footprint = 0;
for (const auto& seg : plan.segments) {
const size_t fp = seg.input_param_bytes +
seg.compute_buffer_size +
seg.output_bytes +
seg.input_previous_cut_bytes +
seg.input_external_bytes;
if (fp > worst_merged_segment_footprint) {
worst_merged_segment_footprint = fp;
}
}
const size_t residency_budget_for_annotate =
residency_budget_bytes > worst_merged_segment_footprint
? residency_budget_bytes - worst_merged_segment_footprint
: 0;
sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
if (base_plan.available) {
sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_for_annotate);
std::vector<ggml_tensor*> resident_params;
uint64_t token = 0;
@@ -2851,7 +2872,6 @@ public:
}
}
}
}
free_compute_buffer();
free_cache_ctx_and_buffer();