mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-25 07:36:38 +00:00
Compare commits
6 Commits
830804262b
...
caa823a8c0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
caa823a8c0 | ||
|
|
22c8c40b0d | ||
|
|
b706d682ad | ||
|
|
b758b7de13 | ||
|
|
f683c88a28 | ||
|
|
21fd4e6788 |
25
.github/workflows/build.yml
vendored
25
.github/workflows/build.yml
vendored
@ -177,7 +177,7 @@ jobs:
|
|||||||
build-and-push-docker-images:
|
build-and-push-docker-images:
|
||||||
name: Build and push container images
|
name: Build and push container images
|
||||||
if: ${{ github.event_name != 'pull_request' }}
|
if: ${{ github.event_name != 'pull_request' }}
|
||||||
runs-on: ubuntu-latest
|
runs-on: ${{ matrix.runner }}
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
@ -189,6 +189,20 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
variant: [musa, sycl, vulkan, cuda]
|
variant: [musa, sycl, vulkan, cuda]
|
||||||
|
platform: [linux/amd64]
|
||||||
|
runner: [ubuntu-latest]
|
||||||
|
build-args: [""]
|
||||||
|
tag-suffix: [""]
|
||||||
|
include:
|
||||||
|
- variant: cuda
|
||||||
|
platform: linux/arm64
|
||||||
|
runner: ubuntu-24.04-arm
|
||||||
|
tag-suffix: "-spark"
|
||||||
|
build-args: |
|
||||||
|
CUDA_VERSION=13.0.0
|
||||||
|
UBUNTU_VERSION=24.04
|
||||||
|
CUDA_ARCHITECTURES=121
|
||||||
|
GGML_CUDA_FA_ALL_QUANTS=ON
|
||||||
|
|
||||||
env:
|
env:
|
||||||
REGISTRY: ghcr.io
|
REGISTRY: ghcr.io
|
||||||
@ -243,12 +257,13 @@ jobs:
|
|||||||
uses: docker/build-push-action@v6
|
uses: docker/build-push-action@v6
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
platforms: linux/amd64
|
platforms: ${{ matrix.platform }}
|
||||||
push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
file: Dockerfile.${{ matrix.variant }}
|
file: Dockerfile.${{ matrix.variant }}
|
||||||
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
|
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}${{ matrix.tag-suffix }}
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
annotations: ${{ steps.meta.outputs.annotations }}
|
annotations: ${{ steps.meta.outputs.annotations }}
|
||||||
|
build-args: ${{ matrix.build-args }}
|
||||||
|
|
||||||
macOS-latest-cmake:
|
macOS-latest-cmake:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
@ -449,7 +464,7 @@ jobs:
|
|||||||
|
|
||||||
env:
|
env:
|
||||||
ROCM_VERSION: "7.13.0"
|
ROCM_VERSION: "7.13.0"
|
||||||
GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
|
GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
@ -648,7 +663,7 @@ jobs:
|
|||||||
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
|
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
|
||||||
build: 'x64'
|
build: 'x64'
|
||||||
- ROCM_VERSION: "7.13.0"
|
- ROCM_VERSION: "7.13.0"
|
||||||
gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
|
gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"
|
||||||
build: x64
|
build: x64
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
@ -10,7 +10,13 @@ WORKDIR /sd.cpp
|
|||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
ARG CUDACXX=/usr/local/cuda/bin/nvcc
|
ARG CUDACXX=/usr/local/cuda/bin/nvcc
|
||||||
RUN cmake . -B ./build -DSD_CUDA=ON
|
ARG CUDA_ARCHITECTURES=""
|
||||||
|
ARG GGML_CUDA_FA_ALL_QUANTS=""
|
||||||
|
|
||||||
|
RUN cmake . -B ./build \
|
||||||
|
-DSD_CUDA=ON \
|
||||||
|
${CUDA_ARCHITECTURES:+-DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}"} \
|
||||||
|
${GGML_CUDA_FA_ALL_QUANTS:+-DGGML_CUDA_FA_ALL_QUANTS=${GGML_CUDA_FA_ALL_QUANTS}}
|
||||||
RUN cmake --build ./build --config Release -j$(nproc)
|
RUN cmake --build ./build --config Release -j$(nproc)
|
||||||
|
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
|
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
|
||||||
|
|||||||
@ -55,7 +55,8 @@ Context Options:
|
|||||||
then threads will be set to the number of CPU physical cores
|
then threads will be set to the number of CPU physical cores
|
||||||
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
||||||
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
|
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
|
||||||
graph splitting; -1 auto-detects free VRAM minus 1 GiB
|
graph splitting; a negative value auto-detects free VRAM, sparing the
|
||||||
|
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
|
||||||
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
||||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
|
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
|
||||||
when needed
|
when needed
|
||||||
|
|||||||
@ -413,7 +413,7 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
options.float_options = {
|
options.float_options = {
|
||||||
{"",
|
{"",
|
||||||
"--max-vram",
|
"--max-vram",
|
||||||
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
|
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)",
|
||||||
&max_vram},
|
&max_vram},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -157,7 +157,8 @@ Context Options:
|
|||||||
then threads will be set to the number of CPU physical cores
|
then threads will be set to the number of CPU physical cores
|
||||||
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
||||||
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
|
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
|
||||||
graph splitting; -1 auto-detects free VRAM minus 1 GiB
|
graph splitting; a negative value auto-detects free VRAM, sparing the
|
||||||
|
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
|
||||||
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
||||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
|
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
|
||||||
when needed
|
when needed
|
||||||
|
|||||||
2
ggml
2
ggml
@ -1 +1 @@
|
|||||||
Subproject commit 7f4ab364b2843921e795d6890d0f42dd5e5d6b63
|
Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421
|
||||||
@ -126,7 +126,8 @@ enum sd_type_t {
|
|||||||
// SD_TYPE_IQ4_NL_8_8 = 38,
|
// SD_TYPE_IQ4_NL_8_8 = 38,
|
||||||
SD_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
SD_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
||||||
SD_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
|
SD_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
|
||||||
SD_TYPE_COUNT = 41,
|
SD_TYPE_Q1_0 = 41,
|
||||||
|
SD_TYPE_COUNT = 42,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum sd_log_level_t {
|
enum sd_log_level_t {
|
||||||
|
|||||||
@ -17,7 +17,6 @@
|
|||||||
namespace sd::ggml_graph_cut {
|
namespace sd::ggml_graph_cut {
|
||||||
|
|
||||||
static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
|
static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
|
||||||
static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;
|
|
||||||
|
|
||||||
static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
|
static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
|
||||||
if (tensor == nullptr) {
|
if (tensor == nullptr) {
|
||||||
@ -93,45 +92,47 @@ namespace sd::ggml_graph_cut {
|
|||||||
return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB);
|
return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
|
static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) {
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
|
LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
|
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
|
||||||
if (dev == nullptr) {
|
if (dev == nullptr) {
|
||||||
LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
|
LOG_WARN("--max-vram < 0 requested, but no backend device is available; disabling graph splitting");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||||
LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
|
LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t free_vram = 0;
|
size_t free_vram = 0;
|
||||||
size_t total_vram = 0;
|
size_t total_vram = 0;
|
||||||
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
|
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
|
||||||
|
size_t spare_bytes = static_cast<size_t>(MAX_VRAM_BYTES_PER_GIB * spare_vram);
|
||||||
|
|
||||||
if (free_vram <= MAX_VRAM_AUTO_RESERVE_BYTES) {
|
if (free_vram <= spare_bytes) {
|
||||||
LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
|
LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget",
|
||||||
free_vram / MAX_VRAM_BYTES_PER_GIB);
|
free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const size_t max_vram_bytes = free_vram - MAX_VRAM_AUTO_RESERVE_BYTES;
|
const size_t max_vram_bytes = free_vram - spare_bytes;
|
||||||
LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
|
LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB",
|
||||||
free_vram / MAX_VRAM_BYTES_PER_GIB,
|
free_vram / MAX_VRAM_BYTES_PER_GIB,
|
||||||
total_vram / MAX_VRAM_BYTES_PER_GIB,
|
total_vram / MAX_VRAM_BYTES_PER_GIB,
|
||||||
|
spare_vram,
|
||||||
max_vram_bytes / MAX_VRAM_BYTES_PER_GIB);
|
max_vram_bytes / MAX_VRAM_BYTES_PER_GIB);
|
||||||
return max_vram_bytes;
|
return max_vram_bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
|
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
|
||||||
if (max_vram != -1.f) {
|
if (max_vram >= 0.f) {
|
||||||
return max_vram;
|
return max_vram;
|
||||||
}
|
}
|
||||||
return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend));
|
return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend));
|
||||||
}
|
}
|
||||||
|
|
||||||
static Segment make_segment_seed(const Plan& plan,
|
static Segment make_segment_seed(const Plan& plan,
|
||||||
|
|||||||
@ -1403,7 +1403,8 @@ namespace LLM {
|
|||||||
out_layers,
|
out_layers,
|
||||||
return_all_hidden_states);
|
return_all_hidden_states);
|
||||||
};
|
};
|
||||||
return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
|
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true),
|
||||||
|
input_ids.dim() + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {
|
int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {
|
||||||
|
|||||||
@ -327,8 +327,9 @@ public:
|
|||||||
LOG_INFO("loading tae from '%s'", sd_ctx_params->taesd_path);
|
LOG_INFO("loading tae from '%s'", sd_ctx_params->taesd_path);
|
||||||
if (!model_loader.init_from_file(sd_ctx_params->taesd_path, "tae.")) {
|
if (!model_loader.init_from_file(sd_ctx_params->taesd_path, "tae.")) {
|
||||||
LOG_WARN("loading tae from '%s' failed", sd_ctx_params->taesd_path);
|
LOG_WARN("loading tae from '%s' failed", sd_ctx_params->taesd_path);
|
||||||
|
} else {
|
||||||
|
use_tae = true;
|
||||||
}
|
}
|
||||||
use_tae = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strlen(SAFE_STR(sd_ctx_params->embeddings_connectors_path)) > 0) {
|
if (strlen(SAFE_STR(sd_ctx_params->embeddings_connectors_path)) > 0) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user