mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-24 23:26:43 +00:00
940 lines
36 KiB
C++
940 lines
36 KiB
C++
#include "model_manager.h"
|
|
|
|
#include <algorithm>
|
|
#include <cstdint>
|
|
#include <iterator>
|
|
#include <mutex>
|
|
#include <unordered_set>
|
|
|
|
#include "core/ggml_extend_backend.h"
|
|
#include "core/util.h"
|
|
#include "model/adapter/lora.hpp"
|
|
|
|
static size_t aligned_offset(const void* buffer, size_t offset, size_t alignment) {
|
|
GGML_ASSERT(alignment != 0 && (alignment & (alignment - 1)) == 0);
|
|
size_t align = (alignment - ((reinterpret_cast<uintptr_t>(buffer) + offset) % alignment)) % alignment;
|
|
return offset + align;
|
|
}
|
|
|
|
static bool lora_specs_equal(const std::vector<ModelManager::LoraSpec>& lhs,
|
|
const std::vector<ModelManager::LoraSpec>& rhs) {
|
|
if (lhs.size() != rhs.size()) {
|
|
return false;
|
|
}
|
|
for (size_t i = 0; i < lhs.size(); ++i) {
|
|
if (lhs[i].path != rhs[i].path ||
|
|
lhs[i].multiplier != rhs[i].multiplier ||
|
|
lhs[i].is_high_noise != rhs[i].is_high_noise ||
|
|
lhs[i].tensor_name_prefix_filter != rhs[i].tensor_name_prefix_filter ||
|
|
lhs[i].required != rhs[i].required) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static std::string lora_id(const ModelManager::LoraSpec& lora) {
|
|
return lora.is_high_noise ? "|high_noise|" + lora.path : lora.path;
|
|
}
|
|
|
|
static bool backend_supports_host_buffer(ggml_backend_t backend) {
|
|
if (backend == nullptr) {
|
|
return false;
|
|
}
|
|
if (sd_backend_is_cpu(backend)) {
|
|
return true;
|
|
}
|
|
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
|
|
if (dev == nullptr) {
|
|
return false;
|
|
}
|
|
ggml_backend_dev_props props;
|
|
ggml_backend_dev_get_props(dev, &props);
|
|
return props.caps.buffer_from_host_ptr;
|
|
}
|
|
|
|
ModelManager::~ModelManager() {
|
|
release_all();
|
|
}
|
|
|
|
void ModelManager::set_common_ignore_tensors(std::set<std::string> ignore_tensors) {
|
|
common_ignore_tensors_ = std::move(ignore_tensors);
|
|
}
|
|
|
|
void ModelManager::set_loras(std::vector<LoraSpec> loras, SDVersion version) {
|
|
if (loras.empty() && loras_.empty()) {
|
|
lora_version_ = version;
|
|
return;
|
|
}
|
|
if (lora_version_ == version && lora_specs_equal(loras_, loras)) {
|
|
return;
|
|
}
|
|
|
|
loras_ = std::move(loras);
|
|
lora_version_ = version;
|
|
current_lora_epoch_++;
|
|
reset_lora_applied_params();
|
|
}
|
|
|
|
std::set<std::string> ModelManager::tensor_names() const {
|
|
std::set<std::string> names;
|
|
for (const auto& state : tensor_states_) {
|
|
if (state != nullptr) {
|
|
names.insert(state->name);
|
|
}
|
|
}
|
|
return names;
|
|
}
|
|
|
|
size_t estimate_tensors_size(const std::map<std::string, ggml_tensor*>& tensors) {
|
|
size_t size = 0;
|
|
std::unordered_set<ggml_tensor*> seen;
|
|
for (const auto& pair : tensors) {
|
|
ggml_tensor* tensor = pair.second;
|
|
if (tensor == nullptr || seen.find(tensor) != seen.end()) {
|
|
continue;
|
|
}
|
|
seen.insert(tensor);
|
|
size += ggml_nbytes(tensor);
|
|
}
|
|
return size;
|
|
}
|
|
|
|
bool ModelManager::register_param_tensors(const std::string& desc,
|
|
std::map<std::string, ggml_tensor*> tensors,
|
|
ResidencyMode residency_mode,
|
|
ggml_backend_t compute_backend,
|
|
ggml_backend_t params_backend,
|
|
size_t* registered_tensor_size) {
|
|
if (desc.empty()) {
|
|
LOG_ERROR("model manager tensor desc is empty");
|
|
return false;
|
|
}
|
|
if (registered_tensor_size != nullptr) {
|
|
*registered_tensor_size += estimate_tensors_size(tensors);
|
|
}
|
|
|
|
std::vector<std::unique_ptr<TensorState>> new_states;
|
|
new_states.reserve(tensors.size());
|
|
|
|
for (const auto& pair : tensors) {
|
|
const std::string& name = pair.first;
|
|
ggml_tensor* tensor = pair.second;
|
|
if (tensor == nullptr) {
|
|
continue;
|
|
}
|
|
if (tensor_states_by_name_.find(name) != tensor_states_by_name_.end()) {
|
|
LOG_ERROR("model manager tensor name '%s' is already registered", name.c_str());
|
|
return false;
|
|
}
|
|
ggml_set_name(tensor, name.c_str());
|
|
|
|
auto state = std::make_unique<TensorState>();
|
|
state->name = name;
|
|
state->tensor = tensor;
|
|
state->desc = desc;
|
|
state->residency_mode = residency_mode;
|
|
state->compute_backend = compute_backend;
|
|
state->params_backend = params_backend;
|
|
new_states.push_back(std::move(state));
|
|
}
|
|
|
|
for (auto& state : new_states) {
|
|
TensorState* registered_state = state.get();
|
|
tensor_states_by_name_[registered_state->name] = registered_state;
|
|
tensor_states_.push_back(std::move(state));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool ModelManager::validate_registered_tensors() {
|
|
bool ok = true;
|
|
for (const auto& state : tensor_states_) {
|
|
if (state == nullptr) {
|
|
ok = false;
|
|
continue;
|
|
}
|
|
bool state_ok = validate_tensor(*state);
|
|
if (state_ok) {
|
|
state->metadata_validated = true;
|
|
}
|
|
ok = state_ok && ok;
|
|
}
|
|
return ok;
|
|
}
|
|
|
|
bool ModelManager::load_tensors_to_params_backend(const std::vector<TensorState*>& states) {
|
|
std::vector<TensorState*> need_load;
|
|
need_load.reserve(states.size());
|
|
for (TensorState* state : states) {
|
|
if (state == nullptr || should_ignore(*state) || is_optional_missing_tensor(state->name)) {
|
|
continue;
|
|
}
|
|
if (!state->metadata_validated) {
|
|
if (!validate_tensor(*state)) {
|
|
return false;
|
|
}
|
|
state->metadata_validated = true;
|
|
}
|
|
if (!state->loaded_to_params_backend) {
|
|
need_load.push_back(state);
|
|
}
|
|
}
|
|
if (need_load.empty()) {
|
|
return true;
|
|
}
|
|
|
|
std::vector<ParamsStorageBlock*> created_storage_blocks;
|
|
if (!mmap_params(need_load, created_storage_blocks)) {
|
|
for (ParamsStorageBlock* block : created_storage_blocks) {
|
|
if (block != nullptr) {
|
|
free_params_storage_block(*block);
|
|
erase_params_storage_block(block);
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
std::vector<TensorState*> need_alloc;
|
|
need_alloc.reserve(need_load.size());
|
|
for (TensorState* state : need_load) {
|
|
if (state->tensor != nullptr && state->tensor->data == nullptr && state->tensor->view_src == nullptr) {
|
|
need_alloc.push_back(state);
|
|
}
|
|
}
|
|
|
|
if (!alloc_params_buffers(need_alloc, created_storage_blocks) ||
|
|
!load_tensors(need_load)) {
|
|
for (ParamsStorageBlock* block : created_storage_blocks) {
|
|
if (block != nullptr) {
|
|
free_params_storage_block(*block);
|
|
erase_params_storage_block(block);
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
for (ParamsStorageBlock* block : created_storage_blocks) {
|
|
if (block != nullptr && block->buffer != nullptr) {
|
|
LOG_DEBUG("model manager prepared params backend buffer (%6.2f MB, %zu tensors, %s)",
|
|
ggml_backend_buffer_get_size(block->buffer) / (1024.f * 1024.f),
|
|
block->states.size(),
|
|
ggml_backend_buffer_is_host(block->buffer) ? "RAM" : "VRAM");
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool ModelManager::stage_tensors_to_compute_backend(const std::vector<TensorState*>& states) {
|
|
std::map<ggml_backend_t, std::vector<TensorState*>> states_by_compute_backend;
|
|
for (TensorState* state : states) {
|
|
if (state == nullptr || should_ignore(*state) || is_optional_missing_tensor(state->name)) {
|
|
continue;
|
|
}
|
|
if (state->compute_backend == nullptr) {
|
|
LOG_ERROR("model manager compute backend is null for tensor '%s'", state->name.c_str());
|
|
return false;
|
|
}
|
|
if (state->params_backend == nullptr) {
|
|
LOG_ERROR("model manager params backend is null for tensor '%s'", state->name.c_str());
|
|
return false;
|
|
}
|
|
if (state->compute_backend == state->params_backend || state->staged_to_compute_backend) {
|
|
continue;
|
|
}
|
|
if (!state->loaded_to_params_backend || state->tensor == nullptr || state->tensor->data == nullptr) {
|
|
LOG_ERROR("model manager tensor '%s' is not loaded to params backend", state->name.c_str());
|
|
return false;
|
|
}
|
|
states_by_compute_backend[state->compute_backend].push_back(state);
|
|
}
|
|
|
|
for (const auto& pair : states_by_compute_backend) {
|
|
ggml_backend_t compute_backend = pair.first;
|
|
const std::vector<TensorState*>& states = pair.second;
|
|
if (states.empty()) {
|
|
continue;
|
|
}
|
|
|
|
int64_t t0 = ggml_time_ms();
|
|
|
|
ggml_init_params init_params;
|
|
init_params.mem_size = std::max<size_t>(1, states.size()) * ggml_tensor_overhead();
|
|
init_params.mem_buffer = nullptr;
|
|
init_params.no_alloc = true;
|
|
|
|
ggml_context* staging_ctx = ggml_init(init_params);
|
|
GGML_ASSERT(staging_ctx != nullptr);
|
|
|
|
std::vector<std::pair<TensorState*, ggml_tensor*>> staged_tensors;
|
|
staged_tensors.reserve(states.size());
|
|
for (TensorState* state : states) {
|
|
ggml_tensor* staging_tensor = ggml_dup_tensor(staging_ctx, state->tensor);
|
|
ggml_set_name(staging_tensor, state->tensor->name);
|
|
staged_tensors.push_back({state, staging_tensor});
|
|
}
|
|
|
|
ggml_backend_buffer_t compute_buffer = ggml_backend_alloc_ctx_tensors(staging_ctx, compute_backend);
|
|
if (compute_buffer == nullptr) {
|
|
LOG_ERROR("model manager alloc compute params backend buffer failed, num_tensors = %zu",
|
|
staged_tensors.size());
|
|
ggml_free(staging_ctx);
|
|
return false;
|
|
}
|
|
ggml_backend_buffer_set_usage(compute_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
|
|
|
for (auto& staged_tensor : staged_tensors) {
|
|
TensorState* state = staged_tensor.first;
|
|
ggml_tensor* managed_tensor = state->tensor;
|
|
ggml_tensor* staging_tensor = staged_tensor.second;
|
|
ggml_backend_tensor_copy(managed_tensor, staging_tensor);
|
|
std::swap(managed_tensor->buffer, staging_tensor->buffer);
|
|
std::swap(managed_tensor->data, staging_tensor->data);
|
|
std::swap(managed_tensor->extra, staging_tensor->extra);
|
|
}
|
|
ggml_backend_synchronize(compute_backend);
|
|
|
|
auto block = std::make_unique<ComputeStagingBlock>();
|
|
block->compute_backend = compute_backend;
|
|
block->buffer = compute_buffer;
|
|
block->staging_ctx = staging_ctx;
|
|
block->staged_tensors = std::move(staged_tensors);
|
|
for (auto& staged_tensor : block->staged_tensors) {
|
|
TensorState* state = staged_tensor.first;
|
|
state->staged_to_compute_backend = true;
|
|
}
|
|
compute_staging_blocks_.push_back(std::move(block));
|
|
|
|
int64_t t1 = ggml_time_ms();
|
|
LOG_DEBUG("model manager staged compute params (%6.2f MB, %zu tensors) to %s, taking %.2fs",
|
|
ggml_backend_buffer_get_size(compute_buffer) / (1024.f * 1024.f),
|
|
states.size(),
|
|
ggml_backend_name(compute_backend),
|
|
(t1 - t0) * 1.0f / 1000);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool ModelManager::apply_loras_to_params(const std::vector<TensorState*>& states) {
|
|
if (loras_.empty()) {
|
|
return true;
|
|
}
|
|
|
|
struct LoraApplyGroup {
|
|
std::map<std::string, ggml_tensor*> model_tensors;
|
|
std::vector<TensorState*> states;
|
|
};
|
|
|
|
std::map<ggml_backend_t, LoraApplyGroup> groups;
|
|
for (TensorState* state : states) {
|
|
if (state == nullptr || state->tensor == nullptr ||
|
|
should_ignore(*state) || is_optional_missing_tensor(state->name)) {
|
|
continue;
|
|
}
|
|
if (state->applied_lora_epoch == current_lora_epoch_) {
|
|
continue;
|
|
}
|
|
if (state->compute_backend == nullptr) {
|
|
LOG_ERROR("model manager compute backend is null for lora target tensor '%s'", state->name.c_str());
|
|
return false;
|
|
}
|
|
if (state->tensor->data == nullptr) {
|
|
LOG_ERROR("model manager lora target tensor '%s' is not prepared", state->name.c_str());
|
|
return false;
|
|
}
|
|
LoraApplyGroup& group = groups[state->compute_backend];
|
|
group.model_tensors[state->name] = state->tensor;
|
|
group.states.push_back(state);
|
|
}
|
|
|
|
if (groups.empty()) {
|
|
return true;
|
|
}
|
|
|
|
std::set<std::string> all_tensor_names = tensor_names();
|
|
for (auto& group_pair : groups) {
|
|
ggml_backend_t compute_backend = group_pair.first;
|
|
LoraApplyGroup& group = group_pair.second;
|
|
for (const LoraSpec& lora_spec : loras_) {
|
|
if (group.model_tensors.empty()) {
|
|
continue;
|
|
}
|
|
|
|
std::string id = lora_id(lora_spec);
|
|
auto lora = std::make_shared<LoraModel>(id,
|
|
compute_backend,
|
|
compute_backend,
|
|
lora_spec.path,
|
|
lora_spec.is_high_noise ? "model.high_noise_" : "",
|
|
lora_version_);
|
|
|
|
LoraModel::filter_t lora_tensor_filter = nullptr;
|
|
if (!lora_spec.tensor_name_prefix_filter.empty()) {
|
|
lora_tensor_filter = [&](const std::string& tensor_name) {
|
|
return starts_with(tensor_name, lora_spec.tensor_name_prefix_filter);
|
|
};
|
|
}
|
|
if (!lora->load_from_file(n_threads_, lora_tensor_filter)) {
|
|
LOG_WARN("load lora tensors from %s failed", lora_spec.path.c_str());
|
|
if (lora_spec.required) {
|
|
return false;
|
|
}
|
|
continue;
|
|
}
|
|
if (lora->lora_tensors.empty()) {
|
|
if (lora_spec.required) {
|
|
LOG_ERROR("required lora has no tensors: %s", lora_spec.path.c_str());
|
|
return false;
|
|
}
|
|
continue;
|
|
}
|
|
lora->multiplier = lora_spec.multiplier;
|
|
lora->apply(group.model_tensors, all_tensor_names, lora_version_, n_threads_, false);
|
|
lora->release_loaded_tensors();
|
|
}
|
|
|
|
for (TensorState* state : group.states) {
|
|
if (state != nullptr) {
|
|
state->applied_lora_epoch = current_lora_epoch_;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void ModelManager::reset_lora_applied_params() {
|
|
release_compute_staging_blocks(true);
|
|
release_params_storage_blocks(true);
|
|
for (auto& state : tensor_states_) {
|
|
state->applied_lora_epoch = UINT64_MAX;
|
|
}
|
|
}
|
|
|
|
bool ModelManager::should_ignore(const TensorState& state) const {
|
|
for (const auto& ignore_prefix : common_ignore_tensors_) {
|
|
if (starts_with(state.name, ignore_prefix)) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool ModelManager::is_optional_missing_tensor(const std::string& name) const {
|
|
return name.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos ||
|
|
name.find("alphas_cumprod") != std::string::npos;
|
|
}
|
|
|
|
bool ModelManager::validate_tensor(const TensorState& state) const {
|
|
if (state.tensor == nullptr || should_ignore(state) || is_optional_missing_tensor(state.name)) {
|
|
return true;
|
|
}
|
|
|
|
const auto& tensor_storage_map = model_loader_.get_tensor_storage_map();
|
|
auto ts_it = tensor_storage_map.find(state.name);
|
|
if (ts_it == tensor_storage_map.end()) {
|
|
LOG_ERROR("%s tensor '%s' not in model metadata", state.desc.c_str(), state.name.c_str());
|
|
return false;
|
|
}
|
|
|
|
const TensorStorage& tensor_storage = ts_it->second;
|
|
if (state.tensor->ne[0] != tensor_storage.ne[0] ||
|
|
state.tensor->ne[1] != tensor_storage.ne[1] ||
|
|
state.tensor->ne[2] != tensor_storage.ne[2] ||
|
|
state.tensor->ne[3] != tensor_storage.ne[3]) {
|
|
LOG_ERROR(
|
|
"%s tensor '%s' has wrong shape in model metadata: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
|
|
state.desc.c_str(),
|
|
state.name.c_str(),
|
|
(int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
|
|
(int)state.tensor->ne[0], (int)state.tensor->ne[1], (int)state.tensor->ne[2], (int)state.tensor->ne[3]);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool ModelManager::mmap_params(const std::vector<TensorState*>& states,
|
|
std::vector<ParamsStorageBlock*>& created_storage_blocks) {
|
|
std::map<std::string, ggml_tensor*> mmap_candidates;
|
|
std::map<std::string, TensorState*> mmap_states;
|
|
for (TensorState* state : states) {
|
|
if (state == nullptr || !can_mmap_storage(*state) || state->tensor == nullptr ||
|
|
state->tensor->data != nullptr || state->tensor->view_src != nullptr) {
|
|
continue;
|
|
}
|
|
mmap_candidates[state->name] = state->tensor;
|
|
mmap_states[state->name] = state;
|
|
}
|
|
if (mmap_candidates.empty()) {
|
|
return true;
|
|
}
|
|
|
|
auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, true);
|
|
if (mmap_store.empty()) {
|
|
return true;
|
|
}
|
|
|
|
auto block = std::make_unique<ParamsStorageBlock>();
|
|
block->mmap_tensor_stores = std::move(mmap_store);
|
|
ParamsStorageBlock* raw = block.get();
|
|
for (const auto& pair : mmap_states) {
|
|
TensorState* state = pair.second;
|
|
if (state != nullptr && state->tensor != nullptr && state->tensor->data != nullptr) {
|
|
block->states.push_back(state);
|
|
}
|
|
}
|
|
|
|
if (!block->states.empty()) {
|
|
params_storage_blocks_.push_back(std::move(block));
|
|
created_storage_blocks.push_back(raw);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool ModelManager::can_mmap_storage(const TensorState& state) const {
|
|
if (!enable_mmap_ || state.residency_mode != ResidencyMode::ParamBackend) {
|
|
return false;
|
|
}
|
|
if (state.compute_backend == nullptr || state.params_backend == nullptr) {
|
|
return false;
|
|
}
|
|
return sd_backend_is_cpu(state.compute_backend) ||
|
|
sd_backend_is_cpu(state.params_backend) ||
|
|
backend_supports_host_buffer(state.compute_backend);
|
|
}
|
|
|
|
bool ModelManager::alloc_params_buffers(const std::vector<TensorState*>& states,
|
|
std::vector<ParamsStorageBlock*>& created_storage_blocks) {
|
|
std::map<std::pair<ggml_backend_buffer_type_t, int>, std::vector<TensorState*>> states_by_buffer_type;
|
|
for (TensorState* state : states) {
|
|
if (state == nullptr || state->tensor == nullptr) {
|
|
continue;
|
|
}
|
|
ggml_backend_buffer_type_t params_buft = params_buffer_type_for(*state);
|
|
if (params_buft == nullptr) {
|
|
return false;
|
|
}
|
|
states_by_buffer_type[{params_buft, static_cast<int>(state->residency_mode)}].push_back(state);
|
|
}
|
|
|
|
for (const auto& pair : states_by_buffer_type) {
|
|
ggml_backend_buffer_type_t params_buft = pair.first.first;
|
|
const std::vector<TensorState*>& states = pair.second;
|
|
size_t alignment = ggml_backend_buft_get_alignment(params_buft);
|
|
size_t max_size = ggml_backend_buft_get_max_size(params_buft);
|
|
|
|
auto alloc_chunk = [&](const std::vector<TensorState*>& chunk, size_t chunk_size) -> bool {
|
|
if (chunk.empty() || chunk_size == 0) {
|
|
return true;
|
|
}
|
|
|
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(params_buft, chunk_size);
|
|
if (buffer == nullptr) {
|
|
LOG_ERROR("model manager alloc params backend buffer failed, size = %.2fMB",
|
|
chunk_size / (1024.0 * 1024.0));
|
|
return false;
|
|
}
|
|
ggml_backend_buffer_set_usage(buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
|
|
|
std::vector<ggml_tensor*> initialized_tensors;
|
|
void* base = ggml_backend_buffer_get_base(buffer);
|
|
size_t offset = aligned_offset(base, 0, ggml_backend_buffer_get_alignment(buffer));
|
|
for (TensorState* state : chunk) {
|
|
ggml_tensor* tensor = state->tensor;
|
|
size_t tensor_size = GGML_PAD(ggml_backend_buffer_get_alloc_size(buffer, tensor),
|
|
ggml_backend_buffer_get_alignment(buffer));
|
|
enum ggml_status status = ggml_backend_tensor_alloc(buffer, tensor, static_cast<char*>(base) + offset);
|
|
if (status != GGML_STATUS_SUCCESS) {
|
|
LOG_ERROR("model manager failed to initialize params tensor '%s'", ggml_get_name(tensor));
|
|
for (ggml_tensor* initialized : initialized_tensors) {
|
|
initialized->buffer = nullptr;
|
|
initialized->data = nullptr;
|
|
initialized->extra = nullptr;
|
|
}
|
|
LOG_DEBUG("model manager releasing params backend buffer (%6.2f MB, %zu tensors, %s)",
|
|
ggml_backend_buffer_get_size(buffer) / (1024.f * 1024.f),
|
|
initialized_tensors.size(),
|
|
ggml_backend_buffer_is_host(buffer) ? "RAM" : "VRAM");
|
|
ggml_backend_buffer_free(buffer);
|
|
return false;
|
|
}
|
|
initialized_tensors.push_back(tensor);
|
|
offset += tensor_size;
|
|
}
|
|
|
|
auto block = std::make_unique<ParamsStorageBlock>();
|
|
block->buffer = buffer;
|
|
block->states = chunk;
|
|
ParamsStorageBlock* raw = block.get();
|
|
params_storage_blocks_.push_back(std::move(block));
|
|
created_storage_blocks.push_back(raw);
|
|
|
|
return true;
|
|
};
|
|
|
|
std::vector<TensorState*> chunk;
|
|
size_t chunk_size = 0;
|
|
for (TensorState* state : states) {
|
|
ggml_tensor* tensor = state->tensor;
|
|
size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(params_buft, tensor), alignment);
|
|
// Some backends, e.g. Vulkan, report a preferred chunk size here rather than a
|
|
// hard per-tensor allocation limit. Oversized tensors are allocated alone.
|
|
if (!chunk.empty() && max_size > 0 && chunk_size + tensor_size > max_size) {
|
|
if (!alloc_chunk(chunk, chunk_size)) {
|
|
return false;
|
|
}
|
|
chunk.clear();
|
|
chunk_size = 0;
|
|
}
|
|
chunk.push_back(state);
|
|
chunk_size += tensor_size;
|
|
}
|
|
|
|
if (!alloc_chunk(chunk, chunk_size)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool ModelManager::load_tensors(const std::vector<TensorState*>& states) {
|
|
std::map<std::string, TensorState*> states_by_name;
|
|
std::set<std::string> target_tensor_names;
|
|
for (TensorState* state : states) {
|
|
if (state == nullptr) {
|
|
continue;
|
|
}
|
|
states_by_name[state->name] = state;
|
|
target_tensor_names.insert(state->name);
|
|
}
|
|
if (states_by_name.empty()) {
|
|
return true;
|
|
}
|
|
|
|
std::set<std::string> loaded_names;
|
|
std::mutex loaded_names_mutex;
|
|
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
|
|
const std::string& name = tensor_storage.name;
|
|
*dst_tensor = nullptr;
|
|
|
|
auto state_it = states_by_name.find(name);
|
|
if (state_it == states_by_name.end()) {
|
|
return true;
|
|
}
|
|
|
|
TensorState* state = state_it->second;
|
|
if (state == nullptr || state->tensor == nullptr) {
|
|
LOG_ERROR("model manager tensor '%s' is null", name.c_str());
|
|
return false;
|
|
}
|
|
|
|
if (state->tensor->ne[0] != tensor_storage.ne[0] ||
|
|
state->tensor->ne[1] != tensor_storage.ne[1] ||
|
|
state->tensor->ne[2] != tensor_storage.ne[2] ||
|
|
state->tensor->ne[3] != tensor_storage.ne[3]) {
|
|
LOG_ERROR(
|
|
"model manager tensor '%s' has wrong shape in model file: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
|
|
name.c_str(),
|
|
(int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
|
|
(int)state->tensor->ne[0], (int)state->tensor->ne[1], (int)state->tensor->ne[2], (int)state->tensor->ne[3]);
|
|
return false;
|
|
}
|
|
|
|
{
|
|
std::lock_guard<std::mutex> lock(loaded_names_mutex);
|
|
loaded_names.insert(name);
|
|
}
|
|
*dst_tensor = state->tensor;
|
|
return true;
|
|
};
|
|
|
|
if (!model_loader_.load_tensors(on_new_tensor_cb, enable_mmap_, &target_tensor_names)) {
|
|
LOG_ERROR("model manager load tensors failed");
|
|
return false;
|
|
}
|
|
|
|
bool missing = false;
|
|
for (const auto& pair : states_by_name) {
|
|
const std::string& name = pair.first;
|
|
if (loaded_names.find(name) == loaded_names.end()) {
|
|
LOG_ERROR("model manager tensor '%s' was not loaded", name.c_str());
|
|
missing = true;
|
|
}
|
|
}
|
|
if (missing) {
|
|
return false;
|
|
}
|
|
|
|
for (const auto& pair : states_by_name) {
|
|
pair.second->loaded_to_params_backend = true;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
ggml_backend_buffer_type_t ModelManager::params_buffer_type_for(const TensorState& state) const {
|
|
if (state.params_backend == nullptr) {
|
|
LOG_ERROR("model manager params backend is null for tensor '%s'", state.name.c_str());
|
|
return nullptr;
|
|
}
|
|
ggml_backend_buffer_type_t params_buft = nullptr;
|
|
if (state.compute_backend != nullptr && state.params_backend != state.compute_backend) {
|
|
ggml_backend_dev_t compute_dev = ggml_backend_get_device(state.compute_backend);
|
|
if (compute_dev != nullptr) {
|
|
params_buft = ggml_backend_dev_host_buffer_type(compute_dev);
|
|
}
|
|
}
|
|
if (params_buft == nullptr) {
|
|
params_buft = ggml_backend_get_default_buffer_type(state.params_backend);
|
|
}
|
|
return params_buft;
|
|
}
|
|
|
|
void ModelManager::free_compute_staging_block(ComputeStagingBlock& block) {
|
|
for (auto& staged_tensor : block.staged_tensors) {
|
|
TensorState* state = staged_tensor.first;
|
|
ggml_tensor* staging_tensor = staged_tensor.second;
|
|
if (state == nullptr || state->tensor == nullptr || staging_tensor == nullptr) {
|
|
continue;
|
|
}
|
|
ggml_tensor* managed_tensor = state->tensor;
|
|
managed_tensor->buffer = staging_tensor->buffer;
|
|
managed_tensor->data = staging_tensor->data;
|
|
managed_tensor->extra = staging_tensor->extra;
|
|
staging_tensor->buffer = nullptr;
|
|
staging_tensor->data = nullptr;
|
|
staging_tensor->extra = nullptr;
|
|
|
|
state->staged_to_compute_backend = false;
|
|
state->applied_lora_epoch = UINT64_MAX;
|
|
}
|
|
|
|
if (block.buffer != nullptr) {
|
|
LOG_DEBUG("model manager releasing compute params (%6.2f MB, %zu tensors) from %s",
|
|
ggml_backend_buffer_get_size(block.buffer) / (1024.f * 1024.f),
|
|
block.staged_tensors.size(),
|
|
block.compute_backend != nullptr ? ggml_backend_name(block.compute_backend) : "unknown");
|
|
ggml_backend_buffer_free(block.buffer);
|
|
block.buffer = nullptr;
|
|
}
|
|
if (block.staging_ctx != nullptr) {
|
|
ggml_free(block.staging_ctx);
|
|
block.staging_ctx = nullptr;
|
|
}
|
|
block.staged_tensors.clear();
|
|
}
|
|
|
|
void ModelManager::release_compute_staging_blocks(bool force,
|
|
const std::unordered_set<TensorState*>* target_states) {
|
|
for (auto it = compute_staging_blocks_.begin(); it != compute_staging_blocks_.end();) {
|
|
ComputeStagingBlock* block = it->get();
|
|
bool can_release = force;
|
|
if (!can_release) {
|
|
can_release = std::all_of(block->staged_tensors.begin(),
|
|
block->staged_tensors.end(),
|
|
[target_states](const std::pair<TensorState*, ggml_tensor*>& pair) {
|
|
TensorState* state = pair.first;
|
|
if (state == nullptr) {
|
|
return true;
|
|
}
|
|
if (target_states != nullptr &&
|
|
target_states->find(state) == target_states->end()) {
|
|
return false;
|
|
}
|
|
return state->active_prepare_count == 0;
|
|
});
|
|
}
|
|
|
|
if (can_release) {
|
|
free_compute_staging_block(*block);
|
|
it = compute_staging_blocks_.erase(it);
|
|
} else {
|
|
++it;
|
|
}
|
|
}
|
|
}
|
|
|
|
void ModelManager::free_params_storage_block(ParamsStorageBlock& block) {
|
|
if (block.buffer != nullptr) {
|
|
LOG_DEBUG("model manager releasing params backend buffer (%6.2f MB, %zu tensors, %s)",
|
|
ggml_backend_buffer_get_size(block.buffer) / (1024.f * 1024.f),
|
|
block.states.size(),
|
|
ggml_backend_buffer_is_host(block.buffer) ? "RAM" : "VRAM");
|
|
ggml_backend_buffer_free(block.buffer);
|
|
block.buffer = nullptr;
|
|
}
|
|
block.mmap_tensor_stores.clear();
|
|
|
|
for (TensorState* state : block.states) {
|
|
if (state == nullptr || state->tensor == nullptr) {
|
|
continue;
|
|
}
|
|
state->tensor->buffer = nullptr;
|
|
state->tensor->data = nullptr;
|
|
state->tensor->extra = nullptr;
|
|
|
|
state->loaded_to_params_backend = false;
|
|
state->applied_lora_epoch = UINT64_MAX;
|
|
}
|
|
block.states.clear();
|
|
}
|
|
|
|
void ModelManager::release_params_storage_blocks(bool force,
|
|
const std::unordered_set<TensorState*>* target_states) {
|
|
for (auto it = params_storage_blocks_.begin(); it != params_storage_blocks_.end();) {
|
|
ParamsStorageBlock* block = it->get();
|
|
bool can_release = force;
|
|
if (!can_release) {
|
|
can_release = std::all_of(block->states.begin(),
|
|
block->states.end(),
|
|
[target_states](TensorState* state) {
|
|
if (state == nullptr) {
|
|
return true;
|
|
}
|
|
if (target_states != nullptr &&
|
|
target_states->find(state) == target_states->end()) {
|
|
return false;
|
|
}
|
|
return state->active_prepare_count == 0 &&
|
|
!state->staged_to_compute_backend &&
|
|
state->residency_mode == ResidencyMode::Disk;
|
|
});
|
|
}
|
|
|
|
if (can_release) {
|
|
free_params_storage_block(*block);
|
|
it = params_storage_blocks_.erase(it);
|
|
} else {
|
|
++it;
|
|
}
|
|
}
|
|
}
|
|
|
|
void ModelManager::erase_params_storage_block(ParamsStorageBlock* block) {
|
|
auto it = std::find_if(params_storage_blocks_.begin(),
|
|
params_storage_blocks_.end(),
|
|
[block](const std::unique_ptr<ParamsStorageBlock>& item) {
|
|
return item.get() == block;
|
|
});
|
|
if (it != params_storage_blocks_.end()) {
|
|
params_storage_blocks_.erase(it);
|
|
}
|
|
}
|
|
|
|
void ModelManager::release_all() {
|
|
for (auto& state : tensor_states_) {
|
|
state->active_prepare_count = 0;
|
|
state->applied_lora_epoch = UINT64_MAX;
|
|
}
|
|
release_compute_staging_blocks(true);
|
|
release_params_storage_blocks(true);
|
|
}
|
|
|
|
bool ModelManager::resolve_required_tensor_states(const std::vector<ggml_tensor*>& tensors,
|
|
std::vector<TensorState*>& required_states) const {
|
|
required_states.clear();
|
|
std::unordered_set<TensorState*> seen;
|
|
for (ggml_tensor* tensor : tensors) {
|
|
if (tensor == nullptr) {
|
|
continue;
|
|
}
|
|
const char* raw_name = ggml_get_name(tensor);
|
|
if (raw_name == nullptr || raw_name[0] == '\0') {
|
|
LOG_ERROR("model manager unnamed tensor is not registered");
|
|
return false;
|
|
}
|
|
auto state_it = tensor_states_by_name_.find(raw_name);
|
|
if (state_it == tensor_states_by_name_.end()) {
|
|
LOG_ERROR("model manager tensor '%s' is not registered", raw_name);
|
|
return false;
|
|
}
|
|
TensorState* state = state_it->second;
|
|
if (state == nullptr) {
|
|
LOG_ERROR("model manager tensor '%s' has no tensor state", raw_name);
|
|
return false;
|
|
}
|
|
if (seen.insert(state).second) {
|
|
required_states.push_back(state);
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool ModelManager::prepare_params(const std::vector<ggml_tensor*>& tensors) {
|
|
if (tensors.empty()) {
|
|
return true;
|
|
}
|
|
|
|
std::vector<TensorState*> required_states;
|
|
if (!resolve_required_tensor_states(tensors, required_states)) {
|
|
return false;
|
|
}
|
|
|
|
if (!load_tensors_to_params_backend(required_states)) {
|
|
return false;
|
|
}
|
|
|
|
if (!stage_tensors_to_compute_backend(required_states)) {
|
|
release_compute_staging_blocks(false);
|
|
release_params_storage_blocks(false);
|
|
return false;
|
|
}
|
|
|
|
if (!apply_loras_to_params(required_states)) {
|
|
release_compute_staging_blocks(false);
|
|
release_params_storage_blocks(false);
|
|
return false;
|
|
}
|
|
|
|
for (TensorState* state : required_states) {
|
|
if (state == nullptr) {
|
|
continue;
|
|
}
|
|
state->active_prepare_count++;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void ModelManager::finish_compute_backend_usage(const std::vector<TensorState*>& states) {
|
|
if (states.empty()) {
|
|
return;
|
|
}
|
|
|
|
std::unordered_set<TensorState*> target_states;
|
|
for (TensorState* state : states) {
|
|
if (state == nullptr || !target_states.insert(state).second) {
|
|
continue;
|
|
}
|
|
if (state->active_prepare_count > 0) {
|
|
state->active_prepare_count--;
|
|
}
|
|
}
|
|
release_compute_staging_blocks(false, &target_states);
|
|
}
|
|
|
|
void ModelManager::release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) {
|
|
if (tensors.empty()) {
|
|
return;
|
|
}
|
|
std::vector<TensorState*> required_states;
|
|
if (!resolve_required_tensor_states(tensors, required_states)) {
|
|
return;
|
|
}
|
|
finish_compute_backend_usage(required_states);
|
|
}
|
|
|
|
void ModelManager::release_params_backend_params(const std::vector<ggml_tensor*>& tensors) {
|
|
if (tensors.empty()) {
|
|
return;
|
|
}
|
|
std::vector<TensorState*> required_states;
|
|
if (!resolve_required_tensor_states(tensors, required_states)) {
|
|
return;
|
|
}
|
|
if (required_states.empty()) {
|
|
return;
|
|
}
|
|
std::unordered_set<TensorState*> target_states(required_states.begin(), required_states.end());
|
|
release_params_storage_blocks(false, &target_states);
|
|
}
|