mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-24 23:26:43 +00:00
Compare commits
No commits in common. "2d40a8b2adcdf8b5b0ca0535f3bb7801b6ba13e5" and "7948df8ac1070f5f6881b8d34675821893eb97d6" have entirely different histories.
2d40a8b2ad
...
7948df8ac1
13
Dockerfile
13
Dockerfile
@ -2,18 +2,7 @@ ARG UBUNTU_VERSION=24.04
|
|||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
|
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake ca-certificates curl gnupg && \
|
|
||||||
mkdir -p /etc/apt/keyrings && \
|
|
||||||
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
|
|
||||||
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
|
|
||||||
rm /tmp/nodesource-repo.gpg.key && \
|
|
||||||
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
|
|
||||||
apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends nodejs && \
|
|
||||||
npm install -g pnpm@10.15.1 && \
|
|
||||||
apt-get clean && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
WORKDIR /sd.cpp
|
||||||
|
|
||||||
|
|||||||
@ -3,18 +3,7 @@ ARG UBUNTU_VERSION=24.04
|
|||||||
|
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
|
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
|
||||||
|
|
||||||
# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
|
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake ca-certificates curl gnupg && \
|
|
||||||
mkdir -p /etc/apt/keyrings && \
|
|
||||||
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
|
|
||||||
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
|
|
||||||
rm /tmp/nodesource-repo.gpg.key && \
|
|
||||||
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
|
|
||||||
apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends nodejs && \
|
|
||||||
npm install -g pnpm@10.15.1 && \
|
|
||||||
apt-get clean && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
WORKDIR /sd.cpp
|
||||||
|
|
||||||
|
|||||||
@ -3,18 +3,7 @@ ARG UBUNTU_VERSION=22.04
|
|||||||
|
|
||||||
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build
|
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build
|
||||||
|
|
||||||
# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
|
RUN apt-get update && apt-get install -y ccache cmake git
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends ccache cmake git ca-certificates curl gnupg && \
|
|
||||||
mkdir -p /etc/apt/keyrings && \
|
|
||||||
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
|
|
||||||
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
|
|
||||||
rm /tmp/nodesource-repo.gpg.key && \
|
|
||||||
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
|
|
||||||
apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends nodejs && \
|
|
||||||
npm install -g pnpm@10.15.1 && \
|
|
||||||
apt-get clean && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
WORKDIR /sd.cpp
|
||||||
|
|
||||||
|
|||||||
@ -3,18 +3,7 @@ ARG SYCL_VERSION=2025.3.2-0
|
|||||||
|
|
||||||
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
|
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
|
||||||
|
|
||||||
# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
|
RUN apt-get update && apt-get install -y cmake
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends cmake ca-certificates curl gnupg && \
|
|
||||||
mkdir -p /etc/apt/keyrings && \
|
|
||||||
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
|
|
||||||
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
|
|
||||||
rm /tmp/nodesource-repo.gpg.key && \
|
|
||||||
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
|
|
||||||
apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends nodejs && \
|
|
||||||
npm install -g pnpm@10.15.1 && \
|
|
||||||
apt-get clean && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
WORKDIR /sd.cpp
|
||||||
|
|
||||||
|
|||||||
@ -2,18 +2,7 @@ ARG UBUNTU_VERSION=24.04
|
|||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
|
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc spirv-headers
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc spirv-headers ca-certificates curl gnupg && \
|
|
||||||
mkdir -p /etc/apt/keyrings && \
|
|
||||||
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
|
|
||||||
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
|
|
||||||
rm /tmp/nodesource-repo.gpg.key && \
|
|
||||||
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
|
|
||||||
apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends nodejs && \
|
|
||||||
npm install -g pnpm@10.15.1 && \
|
|
||||||
apt-get clean && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
WORKDIR /sd.cpp
|
||||||
|
|
||||||
|
|||||||
@ -438,10 +438,6 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
};
|
};
|
||||||
|
|
||||||
options.bool_options = {
|
options.bool_options = {
|
||||||
{"",
|
|
||||||
"--stream-layers",
|
|
||||||
"enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
|
|
||||||
true, &stream_layers},
|
|
||||||
{"",
|
{"",
|
||||||
"--force-sdxl-vae-conv-scale",
|
"--force-sdxl-vae-conv-scale",
|
||||||
"force use of conv scale on sdxl vae",
|
"force use of conv scale on sdxl vae",
|
||||||
@ -724,7 +720,6 @@ std::string SDContextParams::to_string() const {
|
|||||||
<< " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
|
<< " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
|
||||||
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
|
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
|
||||||
<< " max_vram: " << max_vram << ",\n"
|
<< " max_vram: " << max_vram << ",\n"
|
||||||
<< " stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
|
|
||||||
<< " backend: \"" << backend << "\",\n"
|
<< " backend: \"" << backend << "\",\n"
|
||||||
<< " params_backend: \"" << params_backend << "\",\n"
|
<< " params_backend: \"" << params_backend << "\",\n"
|
||||||
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
|
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
|
||||||
@ -805,7 +800,6 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
|
|||||||
qwen_image_zero_cond_t,
|
qwen_image_zero_cond_t,
|
||||||
str_to_vae_format(vae_format),
|
str_to_vae_format(vae_format),
|
||||||
max_vram,
|
max_vram,
|
||||||
stream_layers,
|
|
||||||
backend.c_str(),
|
backend.c_str(),
|
||||||
params_backend.c_str(),
|
params_backend.c_str(),
|
||||||
};
|
};
|
||||||
|
|||||||
@ -113,7 +113,6 @@ struct SDContextParams {
|
|||||||
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
|
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
|
||||||
bool offload_params_to_cpu = false;
|
bool offload_params_to_cpu = false;
|
||||||
float max_vram = 0.f;
|
float max_vram = 0.f;
|
||||||
bool stream_layers = false;
|
|
||||||
std::string backend;
|
std::string backend;
|
||||||
std::string params_backend;
|
std::string params_backend;
|
||||||
bool enable_mmap = false;
|
bool enable_mmap = false;
|
||||||
|
|||||||
@ -222,7 +222,6 @@ typedef struct {
|
|||||||
bool qwen_image_zero_cond_t;
|
bool qwen_image_zero_cond_t;
|
||||||
enum sd_vae_format_t vae_format;
|
enum sd_vae_format_t vae_format;
|
||||||
float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
|
float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
|
||||||
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
|
|
||||||
const char* backend;
|
const char* backend;
|
||||||
const char* params_backend;
|
const char* params_backend;
|
||||||
} sd_ctx_params_t;
|
} sd_ctx_params_t;
|
||||||
|
|||||||
@ -118,7 +118,6 @@ public:
|
|||||||
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
|
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
|
||||||
virtual size_t get_params_buffer_size() = 0;
|
virtual size_t get_params_buffer_size() = 0;
|
||||||
virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {}
|
virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {}
|
||||||
virtual void set_stream_layers_enabled(bool enabled) {}
|
|
||||||
virtual void set_flash_attention_enabled(bool enabled) = 0;
|
virtual void set_flash_attention_enabled(bool enabled) = 0;
|
||||||
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
|
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
|
||||||
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(int n_threads,
|
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(int n_threads,
|
||||||
@ -211,13 +210,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_stream_layers_enabled(bool enabled) override {
|
|
||||||
text_model->set_stream_layers_enabled(enabled);
|
|
||||||
if (sd_version_is_sdxl(version)) {
|
|
||||||
text_model2->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_flash_attention_enabled(bool enabled) override {
|
void set_flash_attention_enabled(bool enabled) override {
|
||||||
text_model->set_flash_attention_enabled(enabled);
|
text_model->set_flash_attention_enabled(enabled);
|
||||||
if (sd_version_is_sdxl(version)) {
|
if (sd_version_is_sdxl(version)) {
|
||||||
@ -851,18 +843,6 @@ struct SD3CLIPEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_stream_layers_enabled(bool enabled) override {
|
|
||||||
if (clip_l) {
|
|
||||||
clip_l->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
if (clip_g) {
|
|
||||||
clip_g->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
if (t5) {
|
|
||||||
t5->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_flash_attention_enabled(bool enabled) override {
|
void set_flash_attention_enabled(bool enabled) override {
|
||||||
if (clip_l) {
|
if (clip_l) {
|
||||||
clip_l->set_flash_attention_enabled(enabled);
|
clip_l->set_flash_attention_enabled(enabled);
|
||||||
@ -1220,15 +1200,6 @@ struct FluxCLIPEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_stream_layers_enabled(bool enabled) override {
|
|
||||||
if (clip_l) {
|
|
||||||
clip_l->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
if (t5) {
|
|
||||||
t5->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_flash_attention_enabled(bool enabled) override {
|
void set_flash_attention_enabled(bool enabled) override {
|
||||||
if (clip_l) {
|
if (clip_l) {
|
||||||
clip_l->set_flash_attention_enabled(enabled);
|
clip_l->set_flash_attention_enabled(enabled);
|
||||||
@ -1463,12 +1434,6 @@ struct T5CLIPEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_stream_layers_enabled(bool enabled) override {
|
|
||||||
if (t5) {
|
|
||||||
t5->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_flash_attention_enabled(bool enabled) override {
|
void set_flash_attention_enabled(bool enabled) override {
|
||||||
if (t5) {
|
if (t5) {
|
||||||
t5->set_flash_attention_enabled(enabled);
|
t5->set_flash_attention_enabled(enabled);
|
||||||
@ -1652,10 +1617,6 @@ struct AnimaConditioner : public Conditioner {
|
|||||||
llm->set_max_graph_vram_bytes(max_vram_bytes);
|
llm->set_max_graph_vram_bytes(max_vram_bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_stream_layers_enabled(bool enabled) override {
|
|
||||||
llm->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_flash_attention_enabled(bool enabled) override {
|
void set_flash_attention_enabled(bool enabled) override {
|
||||||
llm->set_flash_attention_enabled(enabled);
|
llm->set_flash_attention_enabled(enabled);
|
||||||
}
|
}
|
||||||
@ -1804,10 +1765,6 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
llm->set_max_graph_vram_bytes(max_vram_bytes);
|
llm->set_max_graph_vram_bytes(max_vram_bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_stream_layers_enabled(bool enabled) override {
|
|
||||||
llm->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_flash_attention_enabled(bool enabled) override {
|
void set_flash_attention_enabled(bool enabled) override {
|
||||||
llm->set_flash_attention_enabled(enabled);
|
llm->set_flash_attention_enabled(enabled);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -28,7 +28,6 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml_extend_backend.h"
|
#include "ggml_extend_backend.h"
|
||||||
#include "ggml_graph_cut.h"
|
#include "ggml_graph_cut.h"
|
||||||
#include "layer_registry.h"
|
|
||||||
|
|
||||||
#include "model.h"
|
#include "model.h"
|
||||||
#include "tensor.hpp"
|
#include "tensor.hpp"
|
||||||
@ -1698,18 +1697,7 @@ protected:
|
|||||||
ggml_context* partial_offload_ctx = nullptr;
|
ggml_context* partial_offload_ctx = nullptr;
|
||||||
ggml_backend_buffer_t partial_runtime_params_buffer = nullptr;
|
ggml_backend_buffer_t partial_runtime_params_buffer = nullptr;
|
||||||
std::vector<std::pair<ggml_tensor*, ggml_tensor*>> partial_offload_pairs;
|
std::vector<std::pair<ggml_tensor*, ggml_tensor*>> partial_offload_pairs;
|
||||||
|
|
||||||
// Params kept on the runtime backend across streaming segments.
|
|
||||||
ggml_context* resident_offload_ctx = nullptr;
|
|
||||||
std::vector<std::pair<ggml_tensor*, ggml_tensor*>> resident_offload_pairs;
|
|
||||||
ggml_backend_buffer_t resident_runtime_params_buffer = nullptr;
|
|
||||||
std::unordered_set<ggml_tensor*> resident_param_set;
|
|
||||||
uint64_t resident_state_token = 0;
|
|
||||||
|
|
||||||
size_t max_graph_vram_bytes = 0;
|
size_t max_graph_vram_bytes = 0;
|
||||||
bool stream_layers_enabled = false;
|
|
||||||
|
|
||||||
sd::layer_registry::LayerRegistry layer_registry_;
|
|
||||||
|
|
||||||
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
||||||
|
|
||||||
@ -2177,9 +2165,6 @@ protected:
|
|||||||
if (tensor == nullptr) {
|
if (tensor == nullptr) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (resident_param_set.find(tensor) != resident_param_set.end()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (seen_tensors.insert(tensor).second) {
|
if (seen_tensors.insert(tensor).second) {
|
||||||
unique_tensors.push_back(tensor);
|
unique_tensors.push_back(tensor);
|
||||||
}
|
}
|
||||||
@ -2302,114 +2287,6 @@ protected:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool offload_resident_params(const std::vector<ggml_tensor*>& tensors) {
|
|
||||||
if (params_backend == runtime_backend) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (tensors.empty()) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
GGML_ASSERT(resident_runtime_params_buffer == nullptr);
|
|
||||||
GGML_ASSERT(resident_offload_ctx == nullptr);
|
|
||||||
GGML_ASSERT(resident_offload_pairs.empty());
|
|
||||||
GGML_ASSERT(resident_param_set.empty());
|
|
||||||
|
|
||||||
std::vector<ggml_tensor*> unique_tensors;
|
|
||||||
std::unordered_set<ggml_tensor*> seen;
|
|
||||||
unique_tensors.reserve(tensors.size());
|
|
||||||
seen.reserve(tensors.size());
|
|
||||||
for (ggml_tensor* t : tensors) {
|
|
||||||
if (t == nullptr)
|
|
||||||
continue;
|
|
||||||
if (seen.insert(t).second)
|
|
||||||
unique_tensors.push_back(t);
|
|
||||||
}
|
|
||||||
if (unique_tensors.empty())
|
|
||||||
return true;
|
|
||||||
|
|
||||||
ggml_init_params init = {};
|
|
||||||
init.mem_size = std::max<size_t>(1, unique_tensors.size()) * ggml_tensor_overhead();
|
|
||||||
init.mem_buffer = nullptr;
|
|
||||||
init.no_alloc = true;
|
|
||||||
resident_offload_ctx = ggml_init(init);
|
|
||||||
GGML_ASSERT(resident_offload_ctx != nullptr);
|
|
||||||
|
|
||||||
resident_offload_pairs.reserve(unique_tensors.size());
|
|
||||||
for (ggml_tensor* t : unique_tensors) {
|
|
||||||
GGML_ASSERT(t->view_src == nullptr);
|
|
||||||
ggml_tensor* twin = ggml_dup_tensor(resident_offload_ctx, t);
|
|
||||||
ggml_set_name(twin, t->name);
|
|
||||||
resident_offload_pairs.push_back({t, twin});
|
|
||||||
}
|
|
||||||
|
|
||||||
resident_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(resident_offload_ctx, runtime_backend);
|
|
||||||
if (resident_runtime_params_buffer == nullptr) {
|
|
||||||
LOG_ERROR("%s alloc resident runtime params backend buffer failed, num_tensors = %zu",
|
|
||||||
get_desc().c_str(), resident_offload_pairs.size());
|
|
||||||
ggml_free(resident_offload_ctx);
|
|
||||||
resident_offload_ctx = nullptr;
|
|
||||||
resident_offload_pairs.clear();
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
ggml_backend_buffer_set_usage(resident_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
|
||||||
|
|
||||||
for (auto& pair : resident_offload_pairs) {
|
|
||||||
ggml_tensor* t = pair.first;
|
|
||||||
ggml_tensor* twin = pair.second;
|
|
||||||
ggml_backend_tensor_copy(t, twin);
|
|
||||||
std::swap(t->buffer, twin->buffer);
|
|
||||||
std::swap(t->data, twin->data);
|
|
||||||
std::swap(t->extra, twin->extra);
|
|
||||||
resident_param_set.insert(t);
|
|
||||||
}
|
|
||||||
ggml_backend_synchronize(runtime_backend);
|
|
||||||
|
|
||||||
size_t sz = ggml_backend_buffer_get_size(resident_runtime_params_buffer);
|
|
||||||
LOG_INFO("%s offload resident params (%6.2f MB, %zu tensors) to runtime backend (%s)",
|
|
||||||
get_desc().c_str(),
|
|
||||||
sz / (1024.f * 1024.f),
|
|
||||||
resident_offload_pairs.size(),
|
|
||||||
ggml_backend_name(runtime_backend));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void restore_resident_params() {
|
|
||||||
if (resident_offload_pairs.empty()) {
|
|
||||||
if (resident_runtime_params_buffer != nullptr) {
|
|
||||||
ggml_backend_buffer_free(resident_runtime_params_buffer);
|
|
||||||
resident_runtime_params_buffer = nullptr;
|
|
||||||
}
|
|
||||||
if (resident_offload_ctx != nullptr) {
|
|
||||||
ggml_free(resident_offload_ctx);
|
|
||||||
resident_offload_ctx = nullptr;
|
|
||||||
}
|
|
||||||
resident_param_set.clear();
|
|
||||||
resident_state_token = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (auto& pair : resident_offload_pairs) {
|
|
||||||
ggml_tensor* t = pair.first;
|
|
||||||
ggml_tensor* twin = pair.second;
|
|
||||||
t->buffer = twin->buffer;
|
|
||||||
t->data = twin->data;
|
|
||||||
t->extra = twin->extra;
|
|
||||||
twin->buffer = nullptr;
|
|
||||||
twin->data = nullptr;
|
|
||||||
twin->extra = nullptr;
|
|
||||||
}
|
|
||||||
if (resident_runtime_params_buffer != nullptr) {
|
|
||||||
ggml_backend_buffer_free(resident_runtime_params_buffer);
|
|
||||||
resident_runtime_params_buffer = nullptr;
|
|
||||||
}
|
|
||||||
resident_offload_pairs.clear();
|
|
||||||
if (resident_offload_ctx != nullptr) {
|
|
||||||
ggml_free(resident_offload_ctx);
|
|
||||||
resident_offload_ctx = nullptr;
|
|
||||||
}
|
|
||||||
resident_param_set.clear();
|
|
||||||
resident_state_token = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool should_use_graph_cut_segmented_compute(const GraphCutPlan& plan) {
|
bool should_use_graph_cut_segmented_compute(const GraphCutPlan& plan) {
|
||||||
return plan.has_cuts &&
|
return plan.has_cuts &&
|
||||||
plan.valid &&
|
plan.valid &&
|
||||||
@ -2426,80 +2303,20 @@ protected:
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool resolve_graph_cut_plan(ggml_cgraph* gf,
|
bool resolve_graph_cut_plan(ggml_cgraph* gf,
|
||||||
GraphCutPlan* plan_out,
|
GraphCutPlan* plan_out) {
|
||||||
size_t* effective_budget_out = nullptr) {
|
|
||||||
GGML_ASSERT(plan_out != nullptr);
|
GGML_ASSERT(plan_out != nullptr);
|
||||||
GGML_ASSERT(gf != nullptr);
|
GGML_ASSERT(gf != nullptr);
|
||||||
|
|
||||||
// Keep the plan and resident params under the same live-VRAM cap.
|
|
||||||
size_t effective_budget = max_graph_vram_bytes;
|
|
||||||
if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
|
|
||||||
ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
|
|
||||||
if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
|
||||||
size_t free_vram = 0, total_vram = 0;
|
|
||||||
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
|
|
||||||
constexpr size_t safety_margin = 512ull * 1024 * 1024;
|
|
||||||
size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
|
|
||||||
if (free_clamp < effective_budget) {
|
|
||||||
LOG_INFO("%s clamping streaming budget: actual free VRAM %.2f MB < user cap %.2f MB",
|
|
||||||
get_desc().c_str(),
|
|
||||||
free_clamp / (1024.0 * 1024.0),
|
|
||||||
effective_budget / (1024.0 * 1024.0));
|
|
||||||
effective_budget = free_clamp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (effective_budget_out != nullptr) {
|
|
||||||
*effective_budget_out = effective_budget;
|
|
||||||
}
|
|
||||||
|
|
||||||
*plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend,
|
*plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend,
|
||||||
gf,
|
gf,
|
||||||
&graph_cut_plan_cache_,
|
&graph_cut_plan_cache_,
|
||||||
effective_budget,
|
max_graph_vram_bytes,
|
||||||
params_tensor_set_,
|
params_tensor_set_,
|
||||||
get_desc().c_str());
|
get_desc().c_str());
|
||||||
if (stream_layers_enabled) {
|
|
||||||
LOG_INFO("%s streaming budget = %.2f MB",
|
|
||||||
get_desc().c_str(),
|
|
||||||
effective_budget / (1024.0 * 1024.0));
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct PersistentExternalBinding {
|
|
||||||
ggml_backend_buffer_t buffer = nullptr;
|
|
||||||
void* data = nullptr;
|
|
||||||
void* extra = nullptr;
|
|
||||||
};
|
|
||||||
|
|
||||||
void snapshot_persistent_externals(const sd::ggml_graph_cut::Plan& plan,
|
|
||||||
ggml_cgraph* gf,
|
|
||||||
std::unordered_map<ggml_tensor*, PersistentExternalBinding>& out) {
|
|
||||||
GGML_ASSERT(gf != nullptr);
|
|
||||||
out.clear();
|
|
||||||
for (const auto& segment : plan.segments) {
|
|
||||||
for (const auto& input : segment.input_refs) {
|
|
||||||
if (input.type != GraphCutSegment::INPUT_EXTERNAL) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
ggml_tensor* tensor = sd::ggml_graph_cut::input_tensor(gf, input);
|
|
||||||
if (tensor == nullptr || tensor->buffer == nullptr) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
PersistentExternalBinding binding;
|
|
||||||
binding.buffer = tensor->buffer;
|
|
||||||
binding.data = tensor->data;
|
|
||||||
binding.extra = tensor->extra;
|
|
||||||
out[tensor] = binding;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void reset_segment_runtime_tensors(const GraphCutSegment& segment,
|
void reset_segment_runtime_tensors(const GraphCutSegment& segment,
|
||||||
ggml_cgraph* gf,
|
ggml_cgraph* gf) {
|
||||||
const std::unordered_map<ggml_tensor*, PersistentExternalBinding>* persistent_externals = nullptr) {
|
|
||||||
GGML_ASSERT(gf != nullptr);
|
GGML_ASSERT(gf != nullptr);
|
||||||
|
|
||||||
for (const auto& input : segment.input_refs) {
|
for (const auto& input : segment.input_refs) {
|
||||||
@ -2509,25 +2326,11 @@ protected:
|
|||||||
}
|
}
|
||||||
switch (input.type) {
|
switch (input.type) {
|
||||||
case GraphCutSegment::INPUT_PREVIOUS_CUT:
|
case GraphCutSegment::INPUT_PREVIOUS_CUT:
|
||||||
|
case GraphCutSegment::INPUT_EXTERNAL:
|
||||||
input_tensor->buffer = nullptr;
|
input_tensor->buffer = nullptr;
|
||||||
input_tensor->data = nullptr;
|
input_tensor->data = nullptr;
|
||||||
input_tensor->extra = nullptr;
|
input_tensor->extra = nullptr;
|
||||||
break;
|
break;
|
||||||
case GraphCutSegment::INPUT_EXTERNAL: {
|
|
||||||
if (persistent_externals != nullptr) {
|
|
||||||
auto it = persistent_externals->find(input_tensor);
|
|
||||||
if (it != persistent_externals->end()) {
|
|
||||||
input_tensor->buffer = it->second.buffer;
|
|
||||||
input_tensor->data = it->second.data;
|
|
||||||
input_tensor->extra = it->second.extra;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
input_tensor->buffer = nullptr;
|
|
||||||
input_tensor->data = nullptr;
|
|
||||||
input_tensor->extra = nullptr;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case GraphCutSegment::INPUT_PARAM:
|
case GraphCutSegment::INPUT_PARAM:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -2742,9 +2545,6 @@ protected:
|
|||||||
free_compute_buffer();
|
free_compute_buffer();
|
||||||
free_cache_ctx_and_buffer();
|
free_cache_ctx_and_buffer();
|
||||||
|
|
||||||
std::unordered_map<ggml_tensor*, PersistentExternalBinding> persistent_externals;
|
|
||||||
snapshot_persistent_externals(plan, gf, persistent_externals);
|
|
||||||
|
|
||||||
std::optional<sd::Tensor<T>> output = sd::Tensor<T>();
|
std::optional<sd::Tensor<T>> output = sd::Tensor<T>();
|
||||||
for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
|
for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
|
||||||
int64_t t_segment_begin = ggml_time_ms();
|
int64_t t_segment_begin = ggml_time_ms();
|
||||||
@ -2756,7 +2556,7 @@ protected:
|
|||||||
plan.segments.size(),
|
plan.segments.size(),
|
||||||
segment.group_name.c_str());
|
segment.group_name.c_str());
|
||||||
|
|
||||||
reset_segment_runtime_tensors(segment, gf, &persistent_externals);
|
reset_segment_runtime_tensors(segment, gf);
|
||||||
if (!bind_segment_cached_inputs(gf, segment)) {
|
if (!bind_segment_cached_inputs(gf, segment)) {
|
||||||
free_cache_ctx_and_buffer();
|
free_cache_ctx_and_buffer();
|
||||||
free_compute_buffer();
|
free_compute_buffer();
|
||||||
@ -2801,135 +2601,6 @@ protected:
|
|||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
|
||||||
void release_streaming_residency() {
|
|
||||||
restore_resident_params();
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
std::optional<sd::Tensor<T>> compute_streaming_segments(ggml_cgraph* gf,
|
|
||||||
const GraphCutPlan& plan,
|
|
||||||
size_t residency_budget_bytes,
|
|
||||||
int n_threads,
|
|
||||||
bool free_compute_buffer_immediately,
|
|
||||||
bool no_return = false) {
|
|
||||||
GGML_ASSERT(gf != nullptr);
|
|
||||||
|
|
||||||
// Runtime LoRA mutates CPU weights between calls, so resident GPU
|
|
||||||
// copies would go stale.
|
|
||||||
if (weight_adapter != nullptr) {
|
|
||||||
restore_resident_params();
|
|
||||||
} else {
|
|
||||||
sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
|
|
||||||
if (base_plan.available) {
|
|
||||||
sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
|
|
||||||
|
|
||||||
std::vector<ggml_tensor*> resident_params;
|
|
||||||
uint64_t token = 0;
|
|
||||||
for (const auto& segment : base_plan.segments) {
|
|
||||||
if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
|
|
||||||
for (ggml_tensor* t : seg_params) {
|
|
||||||
if (t == nullptr)
|
|
||||||
continue;
|
|
||||||
resident_params.push_back(t);
|
|
||||||
token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (token != resident_state_token) {
|
|
||||||
restore_resident_params();
|
|
||||||
if (!resident_params.empty()) {
|
|
||||||
if (offload_resident_params(resident_params)) {
|
|
||||||
resident_state_token = token;
|
|
||||||
} else {
|
|
||||||
LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
|
|
||||||
get_desc().c_str());
|
|
||||||
restore_resident_params();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
free_compute_buffer();
|
|
||||||
free_cache_ctx_and_buffer();
|
|
||||||
|
|
||||||
layer_registry_.move_layer_to_gpu("_global");
|
|
||||||
|
|
||||||
std::unordered_map<ggml_tensor*, PersistentExternalBinding> persistent_externals;
|
|
||||||
snapshot_persistent_externals(plan, gf, persistent_externals);
|
|
||||||
|
|
||||||
std::optional<sd::Tensor<T>> output = sd::Tensor<T>();
|
|
||||||
for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
|
|
||||||
int64_t t_segment_begin = ggml_time_ms();
|
|
||||||
const auto& segment = plan.segments[seg_idx];
|
|
||||||
const bool is_last = seg_idx + 1 == plan.segments.size();
|
|
||||||
auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx);
|
|
||||||
|
|
||||||
LOG_DEBUG("%s streaming-cut executing segment %zu/%zu: %s (residency=%s)",
|
|
||||||
get_desc().c_str(),
|
|
||||||
seg_idx + 1,
|
|
||||||
plan.segments.size(),
|
|
||||||
segment.group_name.c_str(),
|
|
||||||
segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED");
|
|
||||||
|
|
||||||
if (!layer_registry_.move_layer_to_gpu(segment.group_name)) {
|
|
||||||
LOG_DEBUG("%s streaming: no registry entry for group '%s' (using upstream offload path)",
|
|
||||||
get_desc().c_str(),
|
|
||||||
segment.group_name.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
reset_segment_runtime_tensors(segment, gf, &persistent_externals);
|
|
||||||
if (!bind_segment_cached_inputs(gf, segment)) {
|
|
||||||
free_cache_ctx_and_buffer();
|
|
||||||
free_compute_buffer();
|
|
||||||
free_compute_ctx();
|
|
||||||
return std::nullopt;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!is_last) {
|
|
||||||
for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) {
|
|
||||||
ggml_tensor* out_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx);
|
|
||||||
if (out_tensor != nullptr &&
|
|
||||||
sd::ggml_graph_cut::is_graph_cut_tensor(out_tensor) &&
|
|
||||||
future_cut_names.find(out_tensor->name) != future_cut_names.end()) {
|
|
||||||
cache(out_tensor->name, out_tensor);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_context* segment_graph_ctx = nullptr;
|
|
||||||
ggml_cgraph* segment_graph = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx);
|
|
||||||
auto segment_output = execute_graph<T>(segment_graph,
|
|
||||||
n_threads,
|
|
||||||
/*free_compute_buffer_immediately=*/true,
|
|
||||||
sd::ggml_graph_cut::runtime_param_tensors(gf, segment, get_desc().c_str()),
|
|
||||||
/*preserve_backend_tensor_data_map=*/true,
|
|
||||||
/*no_return=*/!is_last || no_return,
|
|
||||||
&future_cut_names);
|
|
||||||
ggml_free(segment_graph_ctx);
|
|
||||||
if (!segment_output.has_value()) {
|
|
||||||
free_cache_ctx_and_buffer();
|
|
||||||
free_compute_buffer();
|
|
||||||
free_compute_ctx();
|
|
||||||
return std::nullopt;
|
|
||||||
}
|
|
||||||
output = std::move(segment_output);
|
|
||||||
|
|
||||||
if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) {
|
|
||||||
layer_registry_.move_layer_to_cpu(segment.group_name);
|
|
||||||
}
|
|
||||||
(void)t_segment_begin;
|
|
||||||
}
|
|
||||||
|
|
||||||
backend_tensor_data_map.clear();
|
|
||||||
free_cache_ctx_and_buffer();
|
|
||||||
free_compute_ctx();
|
|
||||||
return output;
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
virtual std::string get_desc() = 0;
|
virtual std::string get_desc() = 0;
|
||||||
|
|
||||||
@ -2939,11 +2610,9 @@ public:
|
|||||||
GGML_ASSERT(runtime_backend != nullptr);
|
GGML_ASSERT(runtime_backend != nullptr);
|
||||||
GGML_ASSERT(params_backend != nullptr);
|
GGML_ASSERT(params_backend != nullptr);
|
||||||
alloc_params_ctx();
|
alloc_params_ctx();
|
||||||
layer_registry_.set_backends(runtime_backend, params_backend);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual ~GGMLRunner() {
|
virtual ~GGMLRunner() {
|
||||||
restore_resident_params();
|
|
||||||
free_params_buffer();
|
free_params_buffer();
|
||||||
free_compute_buffer();
|
free_compute_buffer();
|
||||||
free_params_ctx();
|
free_params_ctx();
|
||||||
@ -3016,8 +2685,6 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void free_params_buffer() {
|
void free_params_buffer() {
|
||||||
// Restore swapped resident params before freeing their backing buffer.
|
|
||||||
restore_resident_params();
|
|
||||||
if (params_buffer != nullptr) {
|
if (params_buffer != nullptr) {
|
||||||
ggml_backend_buffer_free(params_buffer);
|
ggml_backend_buffer_free(params_buffer);
|
||||||
params_buffer = nullptr;
|
params_buffer = nullptr;
|
||||||
@ -3117,20 +2784,11 @@ public:
|
|||||||
|
|
||||||
if (can_attempt_graph_cut_segmented_compute()) {
|
if (can_attempt_graph_cut_segmented_compute()) {
|
||||||
GraphCutPlan plan;
|
GraphCutPlan plan;
|
||||||
size_t effective_graph_vram_bytes = 0;
|
if (!resolve_graph_cut_plan(gf, &plan)) {
|
||||||
if (!resolve_graph_cut_plan(gf, &plan, &effective_graph_vram_bytes)) {
|
|
||||||
free_compute_ctx();
|
free_compute_ctx();
|
||||||
return std::nullopt;
|
return std::nullopt;
|
||||||
}
|
}
|
||||||
if (should_use_graph_cut_segmented_compute(plan)) {
|
if (should_use_graph_cut_segmented_compute(plan)) {
|
||||||
if (stream_layers_enabled) {
|
|
||||||
return compute_streaming_segments<T>(gf,
|
|
||||||
plan,
|
|
||||||
effective_graph_vram_bytes,
|
|
||||||
n_threads,
|
|
||||||
free_compute_buffer_immediately,
|
|
||||||
no_return);
|
|
||||||
}
|
|
||||||
return compute_with_graph_cuts<T>(gf,
|
return compute_with_graph_cuts<T>(gf,
|
||||||
plan,
|
plan,
|
||||||
n_threads,
|
n_threads,
|
||||||
@ -3171,12 +2829,6 @@ public:
|
|||||||
max_graph_vram_bytes = max_vram_bytes;
|
max_graph_vram_bytes = max_vram_bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Turns layer streaming on or off for this runner by storing the flag in
// `stream_layers_enabled`.
void set_stream_layers_enabled(bool enabled) { stream_layers_enabled = enabled; }
|
|
||||||
|
|
||||||
// Returns a mutable reference to the layer registry owned by this runner.
sd::layer_registry::LayerRegistry& get_layer_registry() {
    return layer_registry_;
}
|
|
||||||
|
|
||||||
ggml_backend_t get_runtime_backend() {
|
ggml_backend_t get_runtime_backend() {
|
||||||
return runtime_backend;
|
return runtime_backend;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -753,54 +753,4 @@ namespace sd::ggml_graph_cut {
|
|||||||
return resolved_plan;
|
return resolved_plan;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Chooses, for every segment in `plan`, whether its params are STREAMED or
// RESIDENT under a budget of `max_graph_vram_bytes`.
//
// All segments are first reset to STREAMED. Nothing else happens when the
// budget is 0, the plan has fewer than two segments, or no segment has any
// input_param_bytes. Otherwise a reserve of 512 MiB plus the footprint of the
// largest segment is subtracted from the budget, and segments are marked
// RESIDENT in plan order for as long as their accumulated input_param_bytes
// fits in what is left. Marking stops at the first segment that does not fit,
// so the RESIDENT set is always a prefix of the plan.
void annotate_residency(Plan& plan, size_t max_graph_vram_bytes) {
    // A cached plan may be reused with a smaller live budget, so earlier
    // RESIDENT marks must not survive.
    for (auto& segment : plan.segments) {
        segment.residency = SegmentResidency::STREAMED;
    }

    const bool has_budget = max_graph_vram_bytes != 0;
    if (!has_budget || plan.segments.size() < 2) {
        return;
    }

    // One scan collects both facts needed below: whether any segment carries
    // params at all, and the largest per-segment footprint.
    bool has_param_segment          = false;
    size_t largest_segment_footprint = 0;
    for (const auto& segment : plan.segments) {
        if (segment.input_param_bytes > 0) {
            has_param_segment = true;
        }
        const size_t footprint = segment.input_param_bytes +
                                 segment.compute_buffer_size +
                                 segment.output_bytes +
                                 segment.input_previous_cut_bytes +
                                 segment.input_external_bytes;
        if (footprint > largest_segment_footprint) {
            largest_segment_footprint = footprint;
        }
    }
    if (!has_param_segment) {
        return;
    }

    // Leave room for the largest active streamed segment plus a fixed margin.
    constexpr size_t safety = 512ull * 1024 * 1024;
    const size_t headroom   = safety + largest_segment_footprint;
    if (max_graph_vram_bytes <= headroom) {
        return;
    }
    const size_t resident_budget = max_graph_vram_bytes - headroom;

    size_t resident_bytes = 0;
    for (auto& segment : plan.segments) {
        if (resident_bytes + segment.input_param_bytes > resident_budget) {
            break;
        }
        segment.residency = SegmentResidency::RESIDENT;
        resident_bytes += segment.input_param_bytes;
    }
}
|
|
||||||
|
|
||||||
} // namespace sd::ggml_graph_cut
|
} // namespace sd::ggml_graph_cut
|
||||||
|
|||||||
@ -2,7 +2,6 @@
|
|||||||
#define __SD_GGML_GRAPH_CUT_H__
|
#define __SD_GGML_GRAPH_CUT_H__
|
||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <cstdint>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -12,12 +11,6 @@
|
|||||||
|
|
||||||
namespace sd::ggml_graph_cut {
|
namespace sd::ggml_graph_cut {
|
||||||
|
|
||||||
// Streaming residency for a segment's params.
enum class SegmentResidency : uint8_t {
    // Default for every segment; annotate_residency resets segments to this
    // value before deciding which ones may stay resident.
    STREAMED = 0,
    // Assigned by annotate_residency to leading segments whose params fit in
    // the budget left after the streamed-segment headroom.
    RESIDENT = 1,
};
|
|
||||||
|
|
||||||
struct Segment {
|
struct Segment {
|
||||||
enum InputType {
|
enum InputType {
|
||||||
INPUT_EXTERNAL = 0,
|
INPUT_EXTERNAL = 0,
|
||||||
@ -41,7 +34,6 @@ namespace sd::ggml_graph_cut {
|
|||||||
std::vector<int> internal_node_indices;
|
std::vector<int> internal_node_indices;
|
||||||
std::vector<int> output_node_indices;
|
std::vector<int> output_node_indices;
|
||||||
std::vector<InputRef> input_refs;
|
std::vector<InputRef> input_refs;
|
||||||
SegmentResidency residency = SegmentResidency::STREAMED;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Plan {
|
struct Plan {
|
||||||
@ -109,9 +101,6 @@ namespace sd::ggml_graph_cut {
|
|||||||
size_t max_graph_vram_bytes,
|
size_t max_graph_vram_bytes,
|
||||||
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
|
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
|
||||||
const char* log_desc);
|
const char* log_desc);
|
||||||
|
|
||||||
// Mark leading segments resident when they fit after streamed-segment headroom.
|
|
||||||
void annotate_residency(Plan& plan, size_t max_graph_vram_bytes);
|
|
||||||
} // namespace sd::ggml_graph_cut
|
} // namespace sd::ggml_graph_cut
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -1,132 +0,0 @@
|
|||||||
#include "layer_registry.h"
|
|
||||||
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
#include "util.h"
|
|
||||||
|
|
||||||
namespace sd::layer_registry {
|
|
||||||
|
|
||||||
void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) {
|
|
||||||
auto& info = layers_[name];
|
|
||||||
info.tensors.push_back(tensor);
|
|
||||||
info.bytes += ggml_nbytes(tensor);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool LayerRegistry::move_layer_to_gpu(const std::string& name) {
|
|
||||||
auto it = layers_.find(name);
|
|
||||||
if (it == layers_.end())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
LayerInfo& info = it->second;
|
|
||||||
if (info.on_gpu)
|
|
||||||
return true;
|
|
||||||
if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) {
|
|
||||||
LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU",
|
|
||||||
name.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (info.tensors.empty()) {
|
|
||||||
info.on_gpu = true;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 1. Build a no_alloc context big enough to hold one twin tensor per CPU
|
|
||||||
// tensor, plus a little overhead.
|
|
||||||
const size_t ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024;
|
|
||||||
ggml_init_params ctx_params{ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true};
|
|
||||||
ggml_context* twin_ctx = ggml_init(ctx_params);
|
|
||||||
if (twin_ctx == nullptr) {
|
|
||||||
LOG_ERROR("layer_registry: failed to allocate twin context for '%s'",
|
|
||||||
name.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. Create one GPU twin per CPU tensor. The twin shares the original
|
|
||||||
// name so any name-based lookup keeps working.
|
|
||||||
std::vector<ggml_tensor*> gpu_twins;
|
|
||||||
gpu_twins.reserve(info.tensors.size());
|
|
||||||
for (ggml_tensor* cpu_t : info.tensors) {
|
|
||||||
ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t);
|
|
||||||
if (cpu_t->name[0] != '\0') {
|
|
||||||
ggml_set_name(twin, cpu_t->name);
|
|
||||||
}
|
|
||||||
gpu_twins.push_back(twin);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. Back the twins with a GPU buffer in one alloc call.
|
|
||||||
ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_);
|
|
||||||
if (gpu_buffer == nullptr) {
|
|
||||||
LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'",
|
|
||||||
name.c_str());
|
|
||||||
ggml_free(twin_ctx);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4. H2D copy + sync.
|
|
||||||
for (size_t i = 0; i < info.tensors.size(); ++i) {
|
|
||||||
ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]);
|
|
||||||
}
|
|
||||||
ggml_backend_synchronize(gpu_backend_);
|
|
||||||
|
|
||||||
// 5. Swap buffer/data/extra so the originals now point at GPU memory.
|
|
||||||
for (size_t i = 0; i < info.tensors.size(); ++i) {
|
|
||||||
std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer);
|
|
||||||
std::swap(info.tensors[i]->data, gpu_twins[i]->data);
|
|
||||||
std::swap(info.tensors[i]->extra, gpu_twins[i]->extra);
|
|
||||||
}
|
|
||||||
|
|
||||||
info.gpu_twins = std::move(gpu_twins);
|
|
||||||
info.twin_ctx = twin_ctx;
|
|
||||||
info.gpu_buffer = gpu_buffer;
|
|
||||||
info.on_gpu = true;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool LayerRegistry::move_layer_to_cpu(const std::string& name) {
|
|
||||||
auto it = layers_.find(name);
|
|
||||||
if (it == layers_.end())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
LayerInfo& info = it->second;
|
|
||||||
if (!info.on_gpu)
|
|
||||||
return true;
|
|
||||||
if (info.tensors.size() != info.gpu_twins.size()) {
|
|
||||||
LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'",
|
|
||||||
name.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 1. Swap back: originals point at CPU memory again.
|
|
||||||
for (size_t i = 0; i < info.tensors.size(); ++i) {
|
|
||||||
if (info.gpu_twins[i] == nullptr)
|
|
||||||
continue;
|
|
||||||
std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer);
|
|
||||||
std::swap(info.tensors[i]->data, info.gpu_twins[i]->data);
|
|
||||||
std::swap(info.tensors[i]->extra, info.gpu_twins[i]->extra);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. Free the GPU buffer + twin context.
|
|
||||||
if (info.gpu_buffer != nullptr) {
|
|
||||||
ggml_backend_buffer_free(info.gpu_buffer);
|
|
||||||
info.gpu_buffer = nullptr;
|
|
||||||
}
|
|
||||||
if (info.twin_ctx != nullptr) {
|
|
||||||
ggml_free(info.twin_ctx);
|
|
||||||
info.twin_ctx = nullptr;
|
|
||||||
}
|
|
||||||
info.gpu_twins.clear();
|
|
||||||
info.on_gpu = false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool LayerRegistry::is_layer_on_gpu(const std::string& name) const {
|
|
||||||
auto it = layers_.find(name);
|
|
||||||
return it != layers_.end() && it->second.on_gpu;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t LayerRegistry::get_layer_size(const std::string& name) const {
|
|
||||||
auto it = layers_.find(name);
|
|
||||||
return it != layers_.end() ? it->second.bytes : 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace sd::layer_registry
|
|
||||||
@ -1,50 +0,0 @@
|
|||||||
#ifndef __LAYER_REGISTRY_H__
|
|
||||||
#define __LAYER_REGISTRY_H__
|
|
||||||
|
|
||||||
#include <map>
|
|
||||||
#include <set>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
namespace sd::layer_registry {
|
|
||||||
|
|
||||||
struct LayerInfo {
|
|
||||||
std::vector<ggml_tensor*> tensors;
|
|
||||||
std::vector<ggml_tensor*> gpu_twins;
|
|
||||||
ggml_context* twin_ctx = nullptr;
|
|
||||||
ggml_backend_buffer_t gpu_buffer = nullptr;
|
|
||||||
bool on_gpu = false;
|
|
||||||
size_t bytes = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
class LayerRegistry {
|
|
||||||
public:
|
|
||||||
LayerRegistry() = default;
|
|
||||||
LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend)
|
|
||||||
: gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {}
|
|
||||||
|
|
||||||
void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) {
|
|
||||||
gpu_backend_ = gpu_backend;
|
|
||||||
cpu_backend_ = cpu_backend;
|
|
||||||
}
|
|
||||||
void register_layer(const std::string& name, ggml_tensor* tensor);
|
|
||||||
bool move_layer_to_gpu(const std::string& name);
|
|
||||||
bool move_layer_to_cpu(const std::string& name);
|
|
||||||
bool is_layer_on_gpu(const std::string& name) const;
|
|
||||||
size_t get_layer_size(const std::string& name) const;
|
|
||||||
size_t get_layer_count() const { return layers_.size(); }
|
|
||||||
|
|
||||||
const std::map<std::string, LayerInfo>& layers() const { return layers_; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
ggml_backend_t gpu_backend_ = nullptr;
|
|
||||||
ggml_backend_t cpu_backend_ = nullptr;
|
|
||||||
std::map<std::string, LayerInfo> layers_;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace sd::layer_registry
|
|
||||||
|
|
||||||
#endif
|
|
||||||
@ -189,7 +189,6 @@ public:
|
|||||||
sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr};
|
sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr};
|
||||||
bool offload_params_to_cpu = false;
|
bool offload_params_to_cpu = false;
|
||||||
float max_vram = 0.f;
|
float max_vram = 0.f;
|
||||||
bool stream_layers = false;
|
|
||||||
bool use_pmid = false;
|
bool use_pmid = false;
|
||||||
std::string backend_spec;
|
std::string backend_spec;
|
||||||
std::string params_backend_spec;
|
std::string params_backend_spec;
|
||||||
@ -235,7 +234,7 @@ public:
|
|||||||
std::string error;
|
std::string error;
|
||||||
if (!backend_manager.init(sd_ctx_params->backend,
|
if (!backend_manager.init(sd_ctx_params->backend,
|
||||||
sd_ctx_params->params_backend,
|
sd_ctx_params->params_backend,
|
||||||
offload_params_to_cpu,
|
sd_ctx_params->offload_params_to_cpu,
|
||||||
sd_ctx_params->keep_clip_on_cpu,
|
sd_ctx_params->keep_clip_on_cpu,
|
||||||
sd_ctx_params->keep_vae_on_cpu,
|
sd_ctx_params->keep_vae_on_cpu,
|
||||||
sd_ctx_params->keep_control_net_on_cpu,
|
sd_ctx_params->keep_control_net_on_cpu,
|
||||||
@ -262,18 +261,8 @@ public:
|
|||||||
free_params_immediately = sd_ctx_params->free_params_immediately;
|
free_params_immediately = sd_ctx_params->free_params_immediately;
|
||||||
offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
|
offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
|
||||||
max_vram = sd_ctx_params->max_vram;
|
max_vram = sd_ctx_params->max_vram;
|
||||||
stream_layers = sd_ctx_params->stream_layers;
|
|
||||||
backend_spec = SAFE_STR(sd_ctx_params->backend);
|
backend_spec = SAFE_STR(sd_ctx_params->backend);
|
||||||
params_backend_spec = SAFE_STR(sd_ctx_params->params_backend);
|
params_backend_spec = SAFE_STR(sd_ctx_params->params_backend);
|
||||||
if (stream_layers && max_vram == 0.f) {
|
|
||||||
LOG_WARN("--stream-layers has no effect without --max-vram set; ignoring");
|
|
||||||
stream_layers = false;
|
|
||||||
}
|
|
||||||
if (stream_layers && !offload_params_to_cpu && params_backend_spec.empty()) {
|
|
||||||
// Streaming needs CPU-resident params.
|
|
||||||
LOG_WARN("--stream-layers has no effect without --offload-to-cpu (or --params-backend); ignoring");
|
|
||||||
stream_layers = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool use_tae = false;
|
bool use_tae = false;
|
||||||
bool use_audio_vae = false;
|
bool use_audio_vae = false;
|
||||||
@ -452,10 +441,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Avoid full-model LoRA merge buffers on constrained setups.
|
if (have_quantized_weight) {
|
||||||
const bool streaming_constrained = stream_layers ||
|
|
||||||
sd_ctx_params->offload_params_to_cpu;
|
|
||||||
if (have_quantized_weight || streaming_constrained) {
|
|
||||||
apply_lora_immediately = false;
|
apply_lora_immediately = false;
|
||||||
} else {
|
} else {
|
||||||
apply_lora_immediately = true;
|
apply_lora_immediately = true;
|
||||||
@ -751,7 +737,6 @@ public:
|
|||||||
get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE));
|
get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE));
|
||||||
|
|
||||||
diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
||||||
diffusion_model->set_stream_layers_enabled(stream_layers);
|
|
||||||
get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
|
get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
|
||||||
|
|
||||||
if (sd_version_is_unet_edit(version)) {
|
if (sd_version_is_unet_edit(version)) {
|
||||||
@ -760,7 +745,6 @@ public:
|
|||||||
|
|
||||||
if (high_noise_diffusion_model) {
|
if (high_noise_diffusion_model) {
|
||||||
high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
||||||
high_noise_diffusion_model->set_stream_layers_enabled(stream_layers);
|
|
||||||
get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
|
get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2380,15 +2364,6 @@ public:
|
|||||||
if (sd_version_is_pid(version)) {
|
if (sd_version_is_pid(version)) {
|
||||||
return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f);
|
return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f);
|
||||||
}
|
}
|
||||||
// Free resident diffusion params before VAE allocates its compute buffer.
|
|
||||||
if (stream_layers) {
|
|
||||||
if (diffusion_model) {
|
|
||||||
diffusion_model->release_streaming_residency();
|
|
||||||
}
|
|
||||||
if (high_noise_diffusion_model) {
|
|
||||||
high_noise_diffusion_model->release_streaming_residency();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
auto latents = first_stage_model->diffusion_to_vae_latents(x);
|
auto latents = first_stage_model->diffusion_to_vae_latents(x);
|
||||||
first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
|
first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
|
||||||
return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
|
return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
|
||||||
@ -2733,7 +2708,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
|
|||||||
sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO;
|
sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO;
|
||||||
sd_ctx_params->offload_params_to_cpu = false;
|
sd_ctx_params->offload_params_to_cpu = false;
|
||||||
sd_ctx_params->max_vram = 0.f;
|
sd_ctx_params->max_vram = 0.f;
|
||||||
sd_ctx_params->stream_layers = false;
|
|
||||||
sd_ctx_params->enable_mmap = false;
|
sd_ctx_params->enable_mmap = false;
|
||||||
sd_ctx_params->keep_clip_on_cpu = false;
|
sd_ctx_params->keep_clip_on_cpu = false;
|
||||||
sd_ctx_params->keep_control_net_on_cpu = false;
|
sd_ctx_params->keep_control_net_on_cpu = false;
|
||||||
@ -2781,7 +2755,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
"prediction: %s\n"
|
"prediction: %s\n"
|
||||||
"offload_params_to_cpu: %s\n"
|
"offload_params_to_cpu: %s\n"
|
||||||
"max_vram: %.3f\n"
|
"max_vram: %.3f\n"
|
||||||
"stream_layers: %s\n"
|
|
||||||
"backend: %s\n"
|
"backend: %s\n"
|
||||||
"params_backend: %s\n"
|
"params_backend: %s\n"
|
||||||
"keep_clip_on_cpu: %s\n"
|
"keep_clip_on_cpu: %s\n"
|
||||||
@ -2820,7 +2793,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
sd_prediction_name(sd_ctx_params->prediction),
|
sd_prediction_name(sd_ctx_params->prediction),
|
||||||
BOOL_STR(sd_ctx_params->offload_params_to_cpu),
|
BOOL_STR(sd_ctx_params->offload_params_to_cpu),
|
||||||
sd_ctx_params->max_vram,
|
sd_ctx_params->max_vram,
|
||||||
BOOL_STR(sd_ctx_params->stream_layers),
|
|
||||||
SAFE_STR(sd_ctx_params->backend),
|
SAFE_STR(sd_ctx_params->backend),
|
||||||
SAFE_STR(sd_ctx_params->params_backend),
|
SAFE_STR(sd_ctx_params->params_backend),
|
||||||
BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
|
BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
|
||||||
@ -4192,7 +4164,7 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
|
|||||||
std::vector<sd::Tensor<float>> empty_ref_images;
|
std::vector<sd::Tensor<float>> empty_ref_images;
|
||||||
condition_params.ref_images = &empty_ref_images;
|
condition_params.ref_images = &empty_ref_images;
|
||||||
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
|
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
|
||||||
condition_params);
|
condition_params);
|
||||||
if (uncond.c_concat.empty()) {
|
if (uncond.c_concat.empty()) {
|
||||||
uncond.c_concat = latents->uncond_concat_latent; // TODO: optimize
|
uncond.c_concat = latents->uncond_concat_latent; // TODO: optimize
|
||||||
}
|
}
|
||||||
@ -4210,9 +4182,9 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
|
|||||||
|
|
||||||
ImageGenerationEmbeds embeds;
|
ImageGenerationEmbeds embeds;
|
||||||
embeds.img_cond = std::move(img_cond);
|
embeds.img_cond = std::move(img_cond);
|
||||||
embeds.cond = std::move(cond);
|
embeds.cond = std::move(cond);
|
||||||
embeds.uncond = std::move(uncond);
|
embeds.uncond = std::move(uncond);
|
||||||
embeds.id_cond = std::move(id_cond);
|
embeds.id_cond = std::move(id_cond);
|
||||||
|
|
||||||
return embeds;
|
return embeds;
|
||||||
}
|
}
|
||||||
@ -4880,17 +4852,6 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
|||||||
latents.denoise_mask = sd::full<float>({latents.init_latent.shape()[0], latents.init_latent.shape()[1], latents.init_latent.shape()[2], 1, 1}, 1.f);
|
latents.denoise_mask = sd::full<float>({latents.init_latent.shape()[0], latents.init_latent.shape()[1], latents.init_latent.shape()[2], 1, 1}, 1.f);
|
||||||
sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], 0.0f);
|
sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], 0.0f);
|
||||||
|
|
||||||
if (!end_image.empty()) {
|
|
||||||
auto end_img = end_image.reshape({end_image.shape()[0], end_image.shape()[1], 1, end_image.shape()[2], 1});
|
|
||||||
auto end_image_latent = sd_ctx->sd->encode_first_stage(end_img); // [b, c, 1, h/vae_scale_factor, w/vae_scale_factor]
|
|
||||||
if (end_image_latent.empty()) {
|
|
||||||
LOG_ERROR("failed to encode end video frame");
|
|
||||||
return std::nullopt;
|
|
||||||
}
|
|
||||||
sd::ops::slice_assign(&latents.init_latent, 2, latents.init_latent.shape()[2] - 1, latents.init_latent.shape()[2], end_image_latent);
|
|
||||||
sd::ops::fill_slice(&latents.denoise_mask, 2, latents.init_latent.shape()[2] - 1, latents.init_latent.shape()[2], 0.0f);
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t t2 = ggml_time_ms();
|
int64_t t2 = ggml_time_ms();
|
||||||
LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
|
LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
|
||||||
} else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" ||
|
} else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" ||
|
||||||
|
|||||||
@ -25,13 +25,6 @@ void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void UpscalerGGML::set_stream_layers_enabled(bool enabled) {
|
|
||||||
stream_layers_enabled = enabled;
|
|
||||||
if (esrgan_upscaler) {
|
|
||||||
esrgan_upscaler->set_stream_layers_enabled(enabled);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
|
bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
|
||||||
bool offload_params_to_cpu,
|
bool offload_params_to_cpu,
|
||||||
int n_threads) {
|
int n_threads) {
|
||||||
@ -83,7 +76,6 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
|
|||||||
tile_size,
|
tile_size,
|
||||||
model_loader.get_tensor_storage_map());
|
model_loader.get_tensor_storage_map());
|
||||||
esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
||||||
esrgan_upscaler->set_stream_layers_enabled(stream_layers_enabled);
|
|
||||||
if (direct) {
|
if (direct) {
|
||||||
esrgan_upscaler->set_conv2d_direct_enabled(true);
|
esrgan_upscaler->set_conv2d_direct_enabled(true);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -18,7 +18,6 @@ struct UpscalerGGML {
|
|||||||
bool direct = false;
|
bool direct = false;
|
||||||
int tile_size = 128;
|
int tile_size = 128;
|
||||||
size_t max_graph_vram_bytes = 0;
|
size_t max_graph_vram_bytes = 0;
|
||||||
bool stream_layers_enabled = false;
|
|
||||||
std::string backend_spec;
|
std::string backend_spec;
|
||||||
std::string params_backend_spec;
|
std::string params_backend_spec;
|
||||||
|
|
||||||
@ -32,7 +31,6 @@ struct UpscalerGGML {
|
|||||||
bool offload_params_to_cpu,
|
bool offload_params_to_cpu,
|
||||||
int n_threads);
|
int n_threads);
|
||||||
void set_max_graph_vram_bytes(size_t max_vram_bytes);
|
void set_max_graph_vram_bytes(size_t max_vram_bytes);
|
||||||
void set_stream_layers_enabled(bool enabled);
|
|
||||||
sd::Tensor<float> upscale_tensor(const sd::Tensor<float>& input_tensor);
|
sd::Tensor<float> upscale_tensor(const sd::Tensor<float>& input_tensor);
|
||||||
sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor);
|
sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor);
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user