mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-09 15:56:39 +00:00
perf: keep chunk-K residency engaged with runtime LoRA (#1598)
This commit is contained in:
parent
4513e3fda9
commit
a7f2e03da4
@ -2432,12 +2432,17 @@ protected:
|
||||
GGML_ASSERT(gf != nullptr);
|
||||
|
||||
// Keep the plan and resident params under the same live-VRAM cap.
|
||||
// Add back our own resident buffer so we don't see chunk-K's
|
||||
// allocation as "taken" VRAM and shrink the budget on every step.
|
||||
size_t effective_budget = max_graph_vram_bytes;
|
||||
if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
|
||||
ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
|
||||
if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
size_t free_vram = 0, total_vram = 0;
|
||||
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
|
||||
if (resident_runtime_params_buffer != nullptr) {
|
||||
free_vram += ggml_backend_buffer_get_size(resident_runtime_params_buffer);
|
||||
}
|
||||
constexpr size_t safety_margin = 512ull * 1024 * 1024;
|
||||
size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
|
||||
if (free_clamp < effective_budget) {
|
||||
@ -2815,14 +2820,30 @@ public:
|
||||
bool no_return = false) {
|
||||
GGML_ASSERT(gf != nullptr);
|
||||
|
||||
// Runtime LoRA mutates CPU weights between calls, so resident GPU
|
||||
// copies would go stale.
|
||||
if (weight_adapter != nullptr) {
|
||||
restore_resident_params();
|
||||
} else {
|
||||
// Runtime LoRA composes `weight + diff` in the compute graph via
|
||||
// ggml_add; the resident weight tensor's data is never mutated, so
|
||||
// chunk-K residency stays valid across sampling steps.
|
||||
// Reserve room for the worst merged segment so chunk-K can't grow
|
||||
// large enough to starve later partial-param allocations.
|
||||
size_t worst_merged_segment_footprint = 0;
|
||||
for (const auto& seg : plan.segments) {
|
||||
const size_t fp = seg.input_param_bytes +
|
||||
seg.compute_buffer_size +
|
||||
seg.output_bytes +
|
||||
seg.input_previous_cut_bytes +
|
||||
seg.input_external_bytes;
|
||||
if (fp > worst_merged_segment_footprint) {
|
||||
worst_merged_segment_footprint = fp;
|
||||
}
|
||||
}
|
||||
const size_t residency_budget_for_annotate =
|
||||
residency_budget_bytes > worst_merged_segment_footprint
|
||||
? residency_budget_bytes - worst_merged_segment_footprint
|
||||
: 0;
|
||||
|
||||
sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
|
||||
if (base_plan.available) {
|
||||
sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
|
||||
sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_for_annotate);
|
||||
|
||||
std::vector<ggml_tensor*> resident_params;
|
||||
uint64_t token = 0;
|
||||
@ -2851,7 +2872,6 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free_compute_buffer();
|
||||
free_cache_ctx_and_buffer();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user