ci: add RDNA1 + RDNA2 targets for ROCm 7.13 (#1511)

sync: update ggml (#1520)
fix: restore singleton dims for LLM outputs (#1518)
2026-06-24 23:26:43 +00:00 · 2026-05-19 01:38:02 +08:00 · 2026-05-19 01:30:11 +08:00 · 2026-05-18 23:47:10 +08:00 · 2026-05-18 23:32:03 +08:00 · 2026-05-18 23:00:06 +08:00
10 changed files with 52 additions and 25 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -177,7 +177,7 @@ jobs:
  build-and-push-docker-images:
    name: Build and push container images
    if: ${{ github.event_name != 'pull_request' }}
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.runner }}

    permissions:
      contents: read
@@ -189,6 +189,20 @@ jobs:
    strategy:
      matrix:
        variant: [musa, sycl, vulkan, cuda]
+        platform: [linux/amd64]
+        runner: [ubuntu-latest]
+        build-args: [""]
+        tag-suffix: [""]
+        include:
+          - variant: cuda
+            platform: linux/arm64
+            runner: ubuntu-24.04-arm
+            tag-suffix: "-spark"
+            build-args: |
+              CUDA_VERSION=13.0.0
+              UBUNTU_VERSION=24.04
+              CUDA_ARCHITECTURES=121
+              GGML_CUDA_FA_ALL_QUANTS=ON

    env:
      REGISTRY: ghcr.io
@@ -243,12 +257,13 @@ jobs:
        uses: docker/build-push-action@v6
        with:
          context: .
-          platforms: linux/amd64
+          platforms: ${{ matrix.platform }}
          push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
          file: Dockerfile.${{ matrix.variant }}
-          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}${{ matrix.tag-suffix }}
          labels: ${{ steps.meta.outputs.labels }}
          annotations: ${{ steps.meta.outputs.annotations }}
+          build-args: ${{ matrix.build-args }}

  macOS-latest-cmake:
    runs-on: macos-latest
@@ -449,7 +464,7 @@ jobs:

    env:
      ROCM_VERSION: "7.13.0"
-      GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
+      GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"

    steps:
      - uses: actions/checkout@v3
@@ -648,7 +663,7 @@ jobs:
            gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
            build: 'x64'
          - ROCM_VERSION: "7.13.0"
-            gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
+            gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"
            build: x64

    steps:
--- a/Dockerfile.cuda
+++ b/Dockerfile.cuda
@@ -10,7 +10,13 @@ WORKDIR /sd.cpp
 COPY . .

 ARG CUDACXX=/usr/local/cuda/bin/nvcc
-RUN cmake . -B ./build -DSD_CUDA=ON
+ARG CUDA_ARCHITECTURES=""
+ARG GGML_CUDA_FA_ALL_QUANTS=""
+
+RUN cmake . -B ./build \
+    -DSD_CUDA=ON \
+    ${CUDA_ARCHITECTURES:+-DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}"} \
+    ${GGML_CUDA_FA_ALL_QUANTS:+-DGGML_CUDA_FA_ALL_QUANTS=${GGML_CUDA_FA_ALL_QUANTS}}
 RUN cmake --build ./build --config Release -j$(nproc)

 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -55,7 +55,8 @@ Context Options:
                                           then threads will be set to the number of CPU physical cores
  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
  --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
-                                           graph splitting; -1 auto-detects free VRAM minus 1 GiB
+                                           graph splitting; a negative value auto-detects free VRAM, sparing the
+                                           specified value (e.g. -0.5 will keep at least 0.5 GiB free)
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                           when needed
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -413,7 +413,7 @@ ArgOptions SDContextParams::get_options() {
    options.float_options = {
        {"",
         "--max-vram",
-         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
+         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)",
         &max_vram},
    };

--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -157,7 +157,8 @@ Context Options:
                                           then threads will be set to the number of CPU physical cores
  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
  --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
-                                           graph splitting; -1 auto-detects free VRAM minus 1 GiB
+                                           graph splitting; a negative value auto-detects free VRAM, sparing the
+                                           specified value (e.g. -0.5 will keep at least 0.5 GiB free)
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                           when needed
--- a/ggml
+++ b/ggml
@ -1 +1 @@
-Subproject commit 7f4ab364b2843921e795d6890d0f42dd5e5d6b63
+Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -126,7 +126,8 @@ enum sd_type_t {
    // SD_TYPE_IQ4_NL_8_8 = 38,
    SD_TYPE_MXFP4 = 39,  // MXFP4 (1 block)
    SD_TYPE_NVFP4 = 40,  // NVFP4 (4 blocks, E4M3 scale)
-    SD_TYPE_COUNT = 41,
+    SD_TYPE_Q1_0  = 41,
+    SD_TYPE_COUNT = 42,
 };

 enum sd_log_level_t {
--- a/src/ggml_graph_cut.cpp
+++ b/src/ggml_graph_cut.cpp
@@ -17,7 +17,6 @@
 namespace sd::ggml_graph_cut {

    static constexpr double MAX_VRAM_BYTES_PER_GIB      = 1024.0 * 1024.0 * 1024.0;
-    static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;

    static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
        if (tensor == nullptr) {
@@ -93,45 +92,47 @@ namespace sd::ggml_graph_cut {
        return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB);
    }

-    static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
+    static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) {
        if (backend == nullptr) {
-            LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
+            LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting");
            return 0;
        }

        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
        if (dev == nullptr) {
-            LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
+            LOG_WARN("--max-vram < 0 requested, but no backend device is available; disabling graph splitting");
            return 0;
        }
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
-            LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
+            LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting");
            return 0;
        }

        size_t free_vram  = 0;
        size_t total_vram = 0;
        ggml_backend_dev_memory(dev, &free_vram, &total_vram);
+        size_t spare_bytes = static_cast<size_t>(MAX_VRAM_BYTES_PER_GIB * spare_vram);

-        if (free_vram <= MAX_VRAM_AUTO_RESERVE_BYTES) {
-            LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
-                     free_vram / MAX_VRAM_BYTES_PER_GIB);
+        if (free_vram <= spare_bytes) {
+            LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget",
+                     free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram);
            return 0;
        }

-        const size_t max_vram_bytes = free_vram - MAX_VRAM_AUTO_RESERVE_BYTES;
-        LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
+        const size_t max_vram_bytes = free_vram - spare_bytes;
+        LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB",
                 free_vram / MAX_VRAM_BYTES_PER_GIB,
                 total_vram / MAX_VRAM_BYTES_PER_GIB,
+                 spare_vram,
                 max_vram_bytes / MAX_VRAM_BYTES_PER_GIB);
        return max_vram_bytes;
    }

    float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
-        if (max_vram != -1.f) {
+        if (max_vram >= 0.f) {
            return max_vram;
        }
-        return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend));
+        return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend));
    }

    static Segment make_segment_seed(const Plan& plan,
--- a/src/llm.hpp
+++ b/src/llm.hpp
@@ -1403,7 +1403,8 @@ namespace LLM {
                                   out_layers,
                                   return_all_hidden_states);
            };
-            return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true),
+                                                   input_ids.dim() + 1);
        }

        int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -327,8 +327,9 @@ public:
            LOG_INFO("loading tae from '%s'", sd_ctx_params->taesd_path);
            if (!model_loader.init_from_file(sd_ctx_params->taesd_path, "tae.")) {
                LOG_WARN("loading tae from '%s' failed", sd_ctx_params->taesd_path);
+            } else {
+                use_tae = true;
            }
-            use_tae = true;
        }

        if (strlen(SAFE_STR(sd_ctx_params->embeddings_connectors_path)) > 0) {
Author	SHA1	Message	Date
George Sofianos	caa823a8c0	ci: add RDNA1 + RDNA2 targets for ROCm 7.13 (#1511)	2026-05-19 01:38:02 +08:00
leejet	22c8c40b0d	sync: update ggml (#1520)	2026-05-19 01:30:11 +08:00
leejet	b706d682ad	fix: restore singleton dims for LLM outputs (#1518)	2026-05-18 23:47:10 +08:00
leejet	b758b7de13	fix: only enable TAE after successful load (#1517)	2026-05-18 23:32:03 +08:00
Wagner Bruna	f683c88a28	feat: make negative max_vram control the amount of spare vram (#1503)	2026-05-18 23:00:06 +08:00
Christoph	21fd4e6788	ci: add CUDA Docker image support for NVIDIA Spark GB10 (#1512)	2026-05-18 22:52:01 +08:00