Compare commits

...

6 Commits

10 changed files with 52 additions and 25 deletions

View File

@ -177,7 +177,7 @@ jobs:
build-and-push-docker-images:
name: Build and push container images
if: ${{ github.event_name != 'pull_request' }}
runs-on: ubuntu-latest
runs-on: ${{ matrix.runner }}
permissions:
contents: read
@ -189,6 +189,20 @@ jobs:
strategy:
matrix:
variant: [musa, sycl, vulkan, cuda]
platform: [linux/amd64]
runner: [ubuntu-latest]
build-args: [""]
tag-suffix: [""]
include:
- variant: cuda
platform: linux/arm64
runner: ubuntu-24.04-arm
tag-suffix: "-spark"
build-args: |
CUDA_VERSION=13.0.0
UBUNTU_VERSION=24.04
CUDA_ARCHITECTURES=121
GGML_CUDA_FA_ALL_QUANTS=ON
env:
REGISTRY: ghcr.io
@ -243,12 +257,13 @@ jobs:
uses: docker/build-push-action@v6
with:
context: .
platforms: linux/amd64
platforms: ${{ matrix.platform }}
push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
file: Dockerfile.${{ matrix.variant }}
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}${{ matrix.tag-suffix }}
labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }}
build-args: ${{ matrix.build-args }}
macOS-latest-cmake:
runs-on: macos-latest
@ -449,7 +464,7 @@ jobs:
env:
ROCM_VERSION: "7.13.0"
GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"
steps:
- uses: actions/checkout@v3
@ -648,7 +663,7 @@ jobs:
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
build: 'x64'
- ROCM_VERSION: "7.13.0"
gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"
build: x64
steps:

View File

@ -10,7 +10,13 @@ WORKDIR /sd.cpp
COPY . .
ARG CUDACXX=/usr/local/cuda/bin/nvcc
RUN cmake . -B ./build -DSD_CUDA=ON
ARG CUDA_ARCHITECTURES=""
ARG GGML_CUDA_FA_ALL_QUANTS=""
RUN cmake . -B ./build \
-DSD_CUDA=ON \
${CUDA_ARCHITECTURES:+-DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}"} \
${GGML_CUDA_FA_ALL_QUANTS:+-DGGML_CUDA_FA_ALL_QUANTS=${GGML_CUDA_FA_ALL_QUANTS}}
RUN cmake --build ./build --config Release -j$(nproc)
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime

View File

@ -55,7 +55,8 @@ Context Options:
then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting; -1 auto-detects free VRAM minus 1 GiB
graph splitting; a negative value auto-detects free VRAM, sparing the
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed

View File

@ -413,7 +413,7 @@ ArgOptions SDContextParams::get_options() {
options.float_options = {
{"",
"--max-vram",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)",
&max_vram},
};

View File

@ -157,7 +157,8 @@ Context Options:
then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting; -1 auto-detects free VRAM minus 1 GiB
graph splitting; a negative value auto-detects free VRAM, sparing the
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed

2
ggml

@ -1 +1 @@
Subproject commit 7f4ab364b2843921e795d6890d0f42dd5e5d6b63
Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421

View File

@ -126,7 +126,8 @@ enum sd_type_t {
// SD_TYPE_IQ4_NL_8_8 = 38,
SD_TYPE_MXFP4 = 39, // MXFP4 (1 block)
SD_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
SD_TYPE_COUNT = 41,
SD_TYPE_Q1_0 = 41,
SD_TYPE_COUNT = 42,
};
enum sd_log_level_t {

View File

@ -17,7 +17,6 @@
namespace sd::ggml_graph_cut {
static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;
static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
if (tensor == nullptr) {
@ -93,45 +92,47 @@ namespace sd::ggml_graph_cut {
return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB);
}
static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) {
if (backend == nullptr) {
LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting");
return 0;
}
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
if (dev == nullptr) {
LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
LOG_WARN("--max-vram < 0 requested, but no backend device is available; disabling graph splitting");
return 0;
}
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting");
return 0;
}
size_t free_vram = 0;
size_t total_vram = 0;
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
size_t spare_bytes = static_cast<size_t>(MAX_VRAM_BYTES_PER_GIB * spare_vram);
if (free_vram <= MAX_VRAM_AUTO_RESERVE_BYTES) {
LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
free_vram / MAX_VRAM_BYTES_PER_GIB);
if (free_vram <= spare_bytes) {
LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget",
free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram);
return 0;
}
const size_t max_vram_bytes = free_vram - MAX_VRAM_AUTO_RESERVE_BYTES;
LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
const size_t max_vram_bytes = free_vram - spare_bytes;
LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB",
free_vram / MAX_VRAM_BYTES_PER_GIB,
total_vram / MAX_VRAM_BYTES_PER_GIB,
spare_vram,
max_vram_bytes / MAX_VRAM_BYTES_PER_GIB);
return max_vram_bytes;
}
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
if (max_vram != -1.f) {
if (max_vram >= 0.f) {
return max_vram;
}
return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend));
return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend));
}
static Segment make_segment_seed(const Plan& plan,

View File

@ -1403,7 +1403,8 @@ namespace LLM {
out_layers,
return_all_hidden_states);
};
return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true),
input_ids.dim() + 1);
}
int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {

View File

@ -327,8 +327,9 @@ public:
LOG_INFO("loading tae from '%s'", sd_ctx_params->taesd_path);
if (!model_loader.init_from_file(sd_ctx_params->taesd_path, "tae.")) {
LOG_WARN("loading tae from '%s' failed", sd_ctx_params->taesd_path);
} else {
use_tae = true;
}
use_tae = true;
}
if (strlen(SAFE_STR(sd_ctx_params->embeddings_connectors_path)) > 0) {