mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-09 15:56:39 +00:00
perf: keep chunk-K residency engaged with runtime LoRA (#1598)
This commit is contained in:
parent
4513e3fda9
commit
a7f2e03da4
@ -2432,12 +2432,17 @@ protected:
|
|||||||
GGML_ASSERT(gf != nullptr);
|
GGML_ASSERT(gf != nullptr);
|
||||||
|
|
||||||
// Keep the plan and resident params under the same live-VRAM cap.
|
// Keep the plan and resident params under the same live-VRAM cap.
|
||||||
|
// Add back our own resident buffer so we don't see chunk-K's
|
||||||
|
// allocation as "taken" VRAM and shrink the budget on every step.
|
||||||
size_t effective_budget = max_graph_vram_bytes;
|
size_t effective_budget = max_graph_vram_bytes;
|
||||||
if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
|
if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
|
||||||
ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
|
ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
|
||||||
if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||||
size_t free_vram = 0, total_vram = 0;
|
size_t free_vram = 0, total_vram = 0;
|
||||||
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
|
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
|
||||||
|
if (resident_runtime_params_buffer != nullptr) {
|
||||||
|
free_vram += ggml_backend_buffer_get_size(resident_runtime_params_buffer);
|
||||||
|
}
|
||||||
constexpr size_t safety_margin = 512ull * 1024 * 1024;
|
constexpr size_t safety_margin = 512ull * 1024 * 1024;
|
||||||
size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
|
size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
|
||||||
if (free_clamp < effective_budget) {
|
if (free_clamp < effective_budget) {
|
||||||
@ -2815,39 +2820,54 @@ public:
|
|||||||
bool no_return = false) {
|
bool no_return = false) {
|
||||||
GGML_ASSERT(gf != nullptr);
|
GGML_ASSERT(gf != nullptr);
|
||||||
|
|
||||||
// Runtime LoRA mutates CPU weights between calls, so resident GPU
|
// Runtime LoRA composes `weight + diff` in the compute graph via
|
||||||
// copies would go stale.
|
// ggml_add; the resident weight tensor's data is never mutated, so
|
||||||
if (weight_adapter != nullptr) {
|
// chunk-K residency stays valid across sampling steps.
|
||||||
restore_resident_params();
|
// Reserve room for the worst merged segment so chunk-K can't grow
|
||||||
} else {
|
// large enough to starve later partial-param allocations.
|
||||||
sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
|
size_t worst_merged_segment_footprint = 0;
|
||||||
if (base_plan.available) {
|
for (const auto& seg : plan.segments) {
|
||||||
sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
|
const size_t fp = seg.input_param_bytes +
|
||||||
|
seg.compute_buffer_size +
|
||||||
|
seg.output_bytes +
|
||||||
|
seg.input_previous_cut_bytes +
|
||||||
|
seg.input_external_bytes;
|
||||||
|
if (fp > worst_merged_segment_footprint) {
|
||||||
|
worst_merged_segment_footprint = fp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const size_t residency_budget_for_annotate =
|
||||||
|
residency_budget_bytes > worst_merged_segment_footprint
|
||||||
|
? residency_budget_bytes - worst_merged_segment_footprint
|
||||||
|
: 0;
|
||||||
|
|
||||||
std::vector<ggml_tensor*> resident_params;
|
sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
|
||||||
uint64_t token = 0;
|
if (base_plan.available) {
|
||||||
for (const auto& segment : base_plan.segments) {
|
sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_for_annotate);
|
||||||
if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
|
|
||||||
continue;
|
std::vector<ggml_tensor*> resident_params;
|
||||||
}
|
uint64_t token = 0;
|
||||||
auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
|
for (const auto& segment : base_plan.segments) {
|
||||||
for (ggml_tensor* t : seg_params) {
|
if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
|
||||||
if (t == nullptr)
|
continue;
|
||||||
continue;
|
|
||||||
resident_params.push_back(t);
|
|
||||||
token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (token != resident_state_token) {
|
auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
|
||||||
restore_resident_params();
|
for (ggml_tensor* t : seg_params) {
|
||||||
if (!resident_params.empty()) {
|
if (t == nullptr)
|
||||||
if (offload_resident_params(resident_params)) {
|
continue;
|
||||||
resident_state_token = token;
|
resident_params.push_back(t);
|
||||||
} else {
|
token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
|
||||||
LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
|
}
|
||||||
get_desc().c_str());
|
}
|
||||||
restore_resident_params();
|
if (token != resident_state_token) {
|
||||||
}
|
restore_resident_params();
|
||||||
|
if (!resident_params.empty()) {
|
||||||
|
if (offload_resident_params(resident_params)) {
|
||||||
|
resident_state_token = token;
|
||||||
|
} else {
|
||||||
|
LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
|
||||||
|
get_desc().c_str());
|
||||||
|
restore_resident_params();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user