From 064001b5240157a9129a98149b057f6ff76b52f4 Mon Sep 17 00:00:00 2001 From: fszontagh <51741446+fszontagh@users.noreply.github.com> Date: Sat, 6 Jun 2026 10:22:18 +0200 Subject: [PATCH] perf: allocate CPU-offloaded params from runtime device pinned host buffer (#1601) --- src/ggml_extend.hpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index fe0fce9f..9dada344 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -3017,7 +3017,18 @@ public: LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str()); return true; } - params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); + // Pinned host buffer when CPU-offloaded for DMA-direct H2D. + ggml_backend_buffer_type_t params_buft = nullptr; + if (params_backend != runtime_backend) { + ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend); + if (runtime_dev != nullptr) { + params_buft = ggml_backend_dev_host_buffer_type(runtime_dev); + } + } + if (params_buft == nullptr) { + params_buft = ggml_backend_get_default_buffer_type(params_backend); + } + params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft); if (params_buffer == nullptr) { LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i", get_desc().c_str(),