perf: allocate CPU-offloaded params from runtime device pinned host buffer (#1601)

This commit is contained in:
fszontagh 2026-06-06 10:22:18 +02:00 committed by GitHub
parent 1f9ee88e09
commit 064001b524
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -3017,7 +3017,18 @@ public:
LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str()); LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
return true; return true;
} }
- params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
+ // Pinned host buffer when CPU-offloaded for DMA-direct H2D.
+ ggml_backend_buffer_type_t params_buft = nullptr;
+ if (params_backend != runtime_backend) {
+ ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend);
+ if (runtime_dev != nullptr) {
+ params_buft = ggml_backend_dev_host_buffer_type(runtime_dev);
+ }
+ }
+ if (params_buft == nullptr) {
+ params_buft = ggml_backend_get_default_buffer_type(params_backend);
+ }
+ params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft);
if (params_buffer == nullptr) { if (params_buffer == nullptr) {
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i", LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
get_desc().c_str(), get_desc().c_str(),