feat: add extra_c_crossattns support for llm embedder (#1265 )

fix: avoid unwanted file extension changes (#1257 )
fix: correct sdapi handling of cfg_scale and steps (#1260 )
2026-06-25 15:46:40 +00:00 · 2026-02-10 00:00:17 +08:00 · 2026-02-09 23:59:43 +08:00 · 2026-02-09 23:34:19 +08:00 · 2026-02-09 23:29:41 +08:00 · 2026-02-09 23:23:06 +08:00
7 changed files with 425 additions and 200 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -485,6 +485,153 @@ jobs:
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip

+  # Builds the HIP/ROCm backend inside the official ROCm dev container and, on master pushes or
+  # manual release runs, packages the binaries together with the ROCm runtime libraries.
+  ubuntu-latest-rocm:
+    runs-on: ubuntu-latest
+    container: rocm/dev-ubuntu-24.04:7.2
+
+    env:
+      ROCM_VERSION: "7.2"
+      UBUNTU_VERSION: "24.04"
+      GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+
+    steps:
+      - run: apt-get update && apt-get install -y git
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      - name: Free disk space
+        run: |
+          # Remove preinstalled SDKs and caches not needed for this job
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /usr/local/lib/android || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf /usr/local/.ghcup || true
+          sudo rm -rf /opt/hostedtoolcache || true
+
+          # Remove old package lists and caches
+          sudo rm -rf /var/lib/apt/lists/* || true
+          sudo apt clean
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt install -y \
+            cmake \
+            hip-dev \
+            hipblas-dev \
+            ninja-build \
+            rocm-dev \
+            zip
+          # Clean apt caches to recover disk space
+          sudo apt clean
+          sudo rm -rf /var/lib/apt/lists/* || true
+
+      - name: Setup ROCm Environment
+        run: |
+          # Add ROCm to PATH for the following steps
+          echo "/opt/rocm/bin" >> "$GITHUB_PATH"
+
+          # GPU_TARGETS is ';'-separated; convert it to a whitespace-separated list.
+          # The targets are matched one at a time because a '|' that comes out of a variable
+          # expansion is a literal character inside a `case` pattern, not an alternation.
+          TARGETS=$(printf '%s' "$GPU_TARGETS" | tr ';' ' ')
+
+          # Delete the per-architecture files in directory $1 that match none of the targets.
+          # Files without "gfx" in their name are not architecture specific and are kept.
+          prune_unused_archs() {
+            (
+              cd "$1"
+              for file in *; do
+                case "$file" in
+                  *gfx*) ;;
+                  *) continue ;;
+                esac
+                keep=0
+                for target in $TARGETS; do
+                  case "$file" in
+                    *"$target"*) keep=1; break ;;
+                  esac
+                done
+                if [ "$keep" -eq 0 ]; then
+                  sudo rm -f "$file"
+                fi
+              done
+            )
+          }
+
+          # Remove library files for architectures we're not building for to save disk space
+          echo "Cleaning up unneeded architecture files..."
+          prune_unused_archs /opt/rocm/lib/rocblas/library
+          prune_unused_archs /opt/rocm/lib/hipblaslt/library
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -G Ninja \
+            -DCMAKE_CXX_COMPILER=amdclang++ \
+            -DCMAKE_C_COMPILER=amdclang \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DSD_HIPBLAS=ON \
+            -DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+            -DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+            -DSD_BUILD_SHARED_LIBS=ON
+          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Prepare artifacts
+        id: prepare_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          # Copy licenses
+          cp ggml/LICENSE ./build/bin/ggml.txt
+          cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+
+          # Create directories for ROCm libraries
+          mkdir -p ./build/bin/rocblas/library
+          mkdir -p ./build/bin/hipblaslt/library
+
+          # Copy ROCm runtime libraries (use || true to continue if files don't exist)
+          cp /opt/rocm/lib/librocsparse.so* ./build/bin/ || true
+          cp /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/ || true
+          cp /opt/rocm/lib/libamdhip64.so* ./build/bin/ || true
+          cp /opt/rocm/lib/libhipblas.so* ./build/bin/ || true
+          cp /opt/rocm/lib/libhipblaslt.so* ./build/bin/ || true
+          cp /opt/rocm/lib/librocblas.so* ./build/bin/ || true
+
+          # Copy library files (already filtered to target architectures)
+          cp /opt/rocm/lib/rocblas/library/* ./build/bin/rocblas/library/ || true
+          cp /opt/rocm/lib/hipblaslt/library/* ./build/bin/hipblaslt/library/ || true
+
+      - name: Fetch system info
+        id: system-info
+        run: |
+          echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
+          echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
+          echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
+          echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          # Licenses and ROCm runtime files were already staged in build/bin by "Prepare artifacts".
+          # Archive recursively from inside build/bin so that rocblas/library and hipblaslt/library
+          # end up in the zip; `zip -j` without `-r` does not descend into those directories.
+          ZIP_NAME="sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip"
+          (cd ./build/bin && zip -r "../../${ZIP_NAME}" .)
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+          path: |
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@ -493,6 +640,7 @@ jobs:
    needs:
      - ubuntu-latest-cmake
      - ubuntu-latest-cmake-vulkan
+      - ubuntu-latest-rocm
      - build-and-push-docker-images
      - macOS-latest-cmake
      - windows-latest-cmake
--- a/conditioner.hpp
+++ b/conditioner.hpp
@ -10,9 +10,14 @@ struct SDCondition {
    struct ggml_tensor* c_vector    = nullptr;  // aka y
    struct ggml_tensor* c_concat    = nullptr;

+    std::vector<struct ggml_tensor*> extra_c_crossattns;
+
    SDCondition() = default;
-    SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat)
-        : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {}
+    SDCondition(struct ggml_tensor* c_crossattn,
+                struct ggml_tensor* c_vector,
+                struct ggml_tensor* c_concat,
+                const std::vector<struct ggml_tensor*>& extra_c_crossattns = {})
+        : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {}
 };

 struct ConditionerParams {
@ -1696,10 +1701,11 @@ struct LLMEmbedder : public Conditioner {
    }

    std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
-                                                              std::pair<int, int> attn_range,
+                                                              const std::pair<int, int>& attn_range,
                                                              size_t max_length = 0,
                                                              bool padding      = false) {
        std::vector<std::pair<std::string, float>> parsed_attention;
+        if (attn_range.first >= 0 && attn_range.second > 0) {
            parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
            if (attn_range.second - attn_range.first > 0) {
                auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
@ -1708,6 +1714,10 @@ struct LLMEmbedder : public Conditioner {
                                        new_parsed_attention.end());
            }
            parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
+        } else {
+            parsed_attention.emplace_back(text, 1.f);
+        }
+
        {
            std::stringstream ss;
            ss << "[";
@ -1738,19 +1748,110 @@ struct LLMEmbedder : public Conditioner {
        return {tokens, weights};
    }

+    // Encodes a single prompt string into a hidden-state tensor allocated in `work_ctx`.
+    //
+    // Steps:
+    //   1. Tokenize `prompt`; `prompt_attn_range` is forwarded to tokenize() unchanged.
+    //   2. If `max_length > 0` and the prompt is shorter, pad the tokens to `max_length` and
+    //      build an attention mask that marks the padded positions.
+    //   3. Run the LLM (`image_embeds` and `out_layers` are forwarded to llm->compute()).
+    //   4. Multiply each token's hidden state by its prompt weight, then rescale the tensor so
+    //      that its mean equals the mean before weighting.
+    //   5. Drop the first `prompt_template_encode_start_idx` positions along ne[1] and, when
+    //      `min_length > 0`, zero-pad the remainder up to `min_length` positions.
+    //
+    // Returns the new tensor with ne = [ne0, ne1 - start_idx + zero_pad_len, ne2].
+    ggml_tensor* encode_prompt(ggml_context* work_ctx,
+                               int n_threads,
+                               const std::string& prompt,
+                               const std::pair<int, int>& prompt_attn_range,
+                               int max_length,
+                               int min_length,
+                               std::vector<std::pair<int, ggml_tensor*>> image_embeds,
+                               const std::set<int>& out_layers,
+                               int prompt_template_encode_start_idx) {
+        auto tokens_and_weights = tokenize(prompt, prompt_attn_range);
+        auto& tokens            = std::get<0>(tokens_and_weights);
+        auto& weights           = std::get<1>(tokens_and_weights);
+        std::vector<float> mask;
+
+        // The mask stays empty (and no attention mask is passed to the LLM) unless padding
+        // is actually required.
+        if (max_length > 0 && tokens.size() < static_cast<size_t>(max_length)) {
+            mask.insert(mask.end(), tokens.size(), 1.f);
+            mask.insert(mask.end(), max_length - tokens.size(), 0.f);
+            tokenizer->pad_tokens(tokens, weights, max_length, true);
+        }
+
+        struct ggml_tensor* hidden_states = nullptr;  // [N, n_token, hidden_size]
+
+        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
+
+        ggml_tensor* attention_mask = nullptr;
+        if (!mask.empty()) {
+            attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size());
+            ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+                // -INF for padded positions (mask[i0] == 0) and for every entry with i0 > i1;
+                // all other entries are 0.
+                float value = 0.f;
+                if (mask[i0] == 0.f) {
+                    value = -INFINITY;
+                } else if (i0 > i1) {
+                    value = -INFINITY;
+                }
+                ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3);
+            });
+        }
+
+        llm->compute(n_threads,
+                     input_ids,
+                     attention_mask,
+                     image_embeds,
+                     out_layers,
+                     &hidden_states,
+                     work_ctx);
+        {
+            // Apply the per-token prompt weights, then restore the original mean.
+            auto tensor         = hidden_states;
+            float original_mean = ggml_ext_tensor_mean(tensor);
+            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                        float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
+                        value *= weights[i1];
+                        ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
+                    }
+                }
+            }
+            float new_mean = ggml_ext_tensor_mean(tensor);
+            // A zero mean after weighting would turn the scale factor into inf/NaN and
+            // poison every element, so the rescale is skipped in that case.
+            if (new_mean != 0.f) {
+                ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
+            }
+        }
+
+        GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
+
+        // Number of all-zero positions appended so the result has at least `min_length` positions.
+        int64_t zero_pad_len = 0;
+        if (min_length > 0) {
+            if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
+                zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx;
+            }
+        }
+
+        ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
+                                                            GGML_TYPE_F32,
+                                                            hidden_states->ne[0],
+                                                            hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len,
+                                                            hidden_states->ne[2]);
+
+        // Copy the positions after the template prefix; positions past the end of the source
+        // (the zero padding) keep the value 0.
+        ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+            float value = 0.f;
+            if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
+                value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
+            }
+            ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
+        });
+
+        return new_hidden_states;
+    }
+
    SDCondition get_learned_condition(ggml_context* work_ctx,
                                      int n_threads,
                                      const ConditionerParams& conditioner_params) override {
        std::string prompt;
-        std::vector<std::pair<int, ggml_tensor*>> image_embeds;
        std::pair<int, int> prompt_attn_range;
+        std::vector<std::string> extra_prompts;
+        std::vector<std::pair<int, int>> extra_prompts_attn_range;
+        std::vector<std::pair<int, ggml_tensor*>> image_embeds;
        int prompt_template_encode_start_idx = 34;
-        int max_length                       = 0;
+        int max_length                       = 0;  // pad tokens
+        int min_length                       = 0;  // zero pad hidden_states
        std::set<int> out_layers;
-        std::vector<int> tokens;
-        std::vector<float> weights;
-        std::vector<float> mask;
-        if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
+
+        int64_t t0 = ggml_time_ms();
+
+        if (sd_version_is_qwen_image(version)) {
+            if (llm->enable_vision && !conditioner_params.ref_images.empty()) {
                LOG_INFO("QwenImageEditPlusPipeline");
                prompt_template_encode_start_idx = 64;
                int image_embed_idx              = 64 + 6;
@ -1813,8 +1914,20 @@ struct LLMEmbedder : public Conditioner {
                prompt_attn_range.second = static_cast<int>(prompt.size());

                prompt += "<|im_end|>\n<|im_start|>assistant\n";
+            } else {
+                prompt_template_encode_start_idx = 34;
+
+                prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
+
+                prompt_attn_range.first = static_cast<int>(prompt.size());
+                prompt += conditioner_params.text;
+                prompt_attn_range.second = static_cast<int>(prompt.size());
+
+                prompt += "<|im_end|>\n<|im_start|>assistant\n";
+            }
        } else if (version == VERSION_FLUX2) {
            prompt_template_encode_start_idx = 0;
+            min_length                       = 512;
            out_layers                       = {10, 20, 30};

            prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
@ -1828,6 +1941,15 @@ struct LLMEmbedder : public Conditioner {
            prompt_template_encode_start_idx = 0;
            out_layers                       = {35};  // -2

+            if (!conditioner_params.ref_images.empty()) {
+                LOG_INFO("ZImageOmniPipeline");
+                prompt = "<|im_start|>user\n<|vision_start|>";
+                for (int i = 0; i < conditioner_params.ref_images.size() - 1; i++) {
+                    extra_prompts.push_back("<|vision_end|><|vision_start|>");
+                }
+                extra_prompts.push_back("<|vision_end|>" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n<|vision_start|>");
+                extra_prompts.push_back("<|vision_end|><|im_end|>");
+            } else {
                prompt = "<|im_start|>user\n";

                prompt_attn_range.first = static_cast<int>(prompt.size());
@ -1835,6 +1957,7 @@ struct LLMEmbedder : public Conditioner {
                prompt_attn_range.second = static_cast<int>(prompt.size());

                prompt += "<|im_end|>\n<|im_start|>assistant\n";
+            }
        } else if (version == VERSION_FLUX2_KLEIN) {
            prompt_template_encode_start_idx = 0;
            max_length                       = 512;
@ -1847,16 +1970,6 @@ struct LLMEmbedder : public Conditioner {
            prompt_attn_range.second = static_cast<int>(prompt.size());

            prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-
-            auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false);
-            tokens                  = std::get<0>(tokens_and_weights);
-            weights                 = std::get<1>(tokens_and_weights);
-
-            mask.insert(mask.end(), tokens.size(), 1.f);
-            if (tokens.size() < max_length) {
-                mask.insert(mask.end(), max_length - tokens.size(), 0.f);
-                tokenizer->pad_tokens(tokens, weights, max_length, true);
-            }
        } else if (version == VERSION_OVIS_IMAGE) {
            prompt_template_encode_start_idx = 28;
            max_length                       = prompt_template_encode_start_idx + 256;
@ -1869,98 +1982,36 @@ struct LLMEmbedder : public Conditioner {

            prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
        } else {
-            prompt_template_encode_start_idx = 34;
-
-            prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
-
-            prompt_attn_range.first = static_cast<int>(prompt.size());
-            prompt += conditioner_params.text;
-            prompt_attn_range.second = static_cast<int>(prompt.size());
-
-            prompt += "<|im_end|>\n<|im_start|>assistant\n";
+            GGML_ABORT("unknown version %d", version);
        }

-        if (tokens.empty()) {
-            auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0);
-            tokens                  = std::get<0>(tokens_and_weights);
-            weights                 = std::get<1>(tokens_and_weights);
-        }
-
-        int64_t t0                        = ggml_time_ms();
-        struct ggml_tensor* hidden_states = nullptr;  // [N, n_token, 3584]
-
-        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
-
-        ggml_tensor* attention_mask = nullptr;
-        if (!mask.empty()) {
-            attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size());
-            ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-                float value = 0.f;
-                if (mask[i0] == 0.f) {
-                    value = -INFINITY;
-                } else if (i0 > i1) {
-                    value = -INFINITY;
-                }
-                ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3);
-            });
-        }
-
-        llm->compute(n_threads,
-                     input_ids,
-                     attention_mask,
+        auto hidden_states = encode_prompt(work_ctx,
+                                           n_threads,
+                                           prompt,
+                                           prompt_attn_range,
+                                           max_length,
+                                           min_length,
                                           image_embeds,
                                           out_layers,
-                     &hidden_states,
-                     work_ctx);
-        {
-            auto tensor         = hidden_states;
-            float original_mean = ggml_ext_tensor_mean(tensor);
-            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                        float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
-                        value *= weights[i1];
-                        ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
-                    }
-                }
-            }
-            float new_mean = ggml_ext_tensor_mean(tensor);
-            ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
-        }
+                                           prompt_template_encode_start_idx);

-        GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
-
-        int64_t min_length = 0;
-        if (version == VERSION_FLUX2) {
-            min_length = 512;
+        std::vector<ggml_tensor*> extra_hidden_states_vec;
+        for (int i = 0; i < extra_prompts.size(); i++) {
+            auto extra_hidden_states = encode_prompt(work_ctx,
+                                                     n_threads,
+                                                     extra_prompts[i],
+                                                     extra_prompts_attn_range[i],
+                                                     max_length,
+                                                     min_length,
+                                                     image_embeds,
+                                                     out_layers,
+                                                     prompt_template_encode_start_idx);
+            extra_hidden_states_vec.push_back(extra_hidden_states);
        }

-        int64_t zero_pad_len = 0;
-        if (min_length > 0) {
-            if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
-                zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx;
-            }
-        }
-
-        ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
-                                                            GGML_TYPE_F32,
-                                                            hidden_states->ne[0],
-                                                            hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len,
-                                                            hidden_states->ne[2]);
-
-        ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = 0.f;
-            if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
-                value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
-            }
-            ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
-        });
-
-        // print_ggml_tensor(new_hidden_states);
-
        int64_t t1 = ggml_time_ms();
        LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
-        return {new_hidden_states, nullptr, nullptr};
+        return {hidden_states, nullptr, nullptr, extra_hidden_states_vec};
    }
 };

--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -394,12 +394,15 @@ bool save_results(const SDCliParams& cli_params,

    fs::path base_path = out_path;
    fs::path ext       = out_path.has_extension() ? out_path.extension() : fs::path{};
-    if (!ext.empty())
-        base_path.replace_extension();

    std::string ext_lower = ext.string();
    std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(), ::tolower);
    bool is_jpg = (ext_lower == ".jpg" || ext_lower == ".jpeg" || ext_lower == ".jpe");
+    if (!ext.empty()) {
+        if (is_jpg || ext_lower == ".png") {
+            base_path.replace_extension();
+        }
+    }

    int output_begin_idx = cli_params.output_begin_idx;
    if (output_begin_idx < 0) {
@ -409,7 +412,7 @@ bool save_results(const SDCliParams& cli_params,
    auto write_image = [&](const fs::path& path, int idx) {
        const sd_image_t& img = results[idx];
        if (!img.data)
-            return;
+            return false;

        std::string params = get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + idx);
        int ok             = 0;
@ -419,8 +422,11 @@ bool save_results(const SDCliParams& cli_params,
            ok = stbi_write_png(path.string().c_str(), img.width, img.height, img.channel, img.data, 0, params.c_str());
        }
        LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
+        return ok != 0;
    };

+    int sucessful_reults = 0;
+
    if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
        if (!is_jpg && ext_lower != ".png")
            ext = ".png";
@ -429,9 +435,12 @@ bool save_results(const SDCliParams& cli_params,

        for (int i = 0; i < num_results; ++i) {
            fs::path img_path = format_frame_idx(pattern.string(), output_begin_idx + i);
-            write_image(img_path, i);
+            if (write_image(img_path, i)) {
+                sucessful_reults++;
            }
-        return true;
+        }
+        LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+        return sucessful_reults != 0;
    }

    if (cli_params.mode == VID_GEN && num_results > 1) {
@ -439,9 +448,13 @@ bool save_results(const SDCliParams& cli_params,
            ext = ".avi";
        fs::path video_path = base_path;
        video_path += ext;
-        create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps);
+        if (create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {
            LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
            return true;
+        } else {
+            LOG_ERROR("Failed to save result MPG AVI video to '%s'", video_path.string().c_str());
+            return false;
+        }
    }

    if (!is_jpg && ext_lower != ".png")
@ -453,10 +466,12 @@ bool save_results(const SDCliParams& cli_params,
            img_path += "_" + std::to_string(output_begin_idx + i);
        }
        img_path += ext;
-        write_image(img_path, i);
+        if (write_image(img_path, i)) {
+            sucessful_reults++;
        }
-
-    return true;
+    }
+    LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+    return sucessful_reults != 0;
 }

 int main(int argc, const char* argv[]) {
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@ -786,8 +786,8 @@ int main(int argc, const char** argv) {
            std::string negative_prompt = j.value("negative_prompt", "");
            int width                   = j.value("width", 512);
            int height                  = j.value("height", 512);
-            int steps                   = j.value("steps", -1);
-            float cfg_scale             = j.value("cfg_scale", 7.f);
+            int steps                   = j.value("steps", default_gen_params.sample_params.sample_steps);
+            float cfg_scale             = j.value("cfg_scale", default_gen_params.sample_params.guidance.txt_cfg);
            int64_t seed                = j.value("seed", -1);
            int batch_size              = j.value("batch_size", 1);
            int clip_skip               = j.value("clip_skip", -1);
@ -883,8 +883,6 @@ int main(int argc, const char** argv) {

            enum scheduler_t scheduler = str_to_scheduler(scheduler_name.c_str());

-            // avoid excessive resource usage
-
            SDGenerationParams gen_params             = default_gen_params;
            gen_params.prompt                         = prompt;
            gen_params.negative_prompt                = negative_prompt;
@ -893,6 +891,7 @@ int main(int argc, const char** argv) {
            gen_params.seed                           = seed;
            gen_params.sample_params.sample_steps     = steps;
            gen_params.batch_count                    = batch_size;
+            gen_params.sample_params.guidance.txt_cfg = cfg_scale;

            if (clip_skip > 0) {
                gen_params.clip_skip = clip_skip;
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -767,7 +767,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor*
    return x;
 }

-typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;
+typedef std::function<bool(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;

 __STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
                                            float& tile_overlap_factor_dim,
@ -918,12 +918,15 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,

            int64_t t1 = ggml_time_ms();
            ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in);
-            on_processing(input_tile, output_tile, false);
+            if (on_processing(input_tile, output_tile, false)) {
                ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);

                int64_t t2 = ggml_time_ms();
                last_time  = (t2 - t1) / 1000.0f;
                pretty_progress(tile_count, num_tiles, last_time);
+            } else {
+                LOG_ERROR("Failed to process patch %d at (%d, %d)", tile_count, x, y);
+            }
            tile_count++;
        }
        last_x = false;
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -1558,7 +1558,7 @@ public:
                if (vae_tiling_params.enabled) {
                    // split latent in 32x32 tiles and compute in several steps
                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                        first_stage_model->compute(n_threads, in, true, &out, nullptr);
+                        return first_stage_model->compute(n_threads, in, true, &out, nullptr);
                    };
                    silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);

@ -1577,7 +1577,7 @@ public:
                if (vae_tiling_params.enabled) {
                    // split latent in 64x64 tiles and compute in several steps
                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                        tae_first_stage->compute(n_threads, in, true, &out, nullptr);
+                        return tae_first_stage->compute(n_threads, in, true, &out, nullptr);
                    };
                    silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
                } else {
@ -2546,7 +2546,7 @@ public:
                LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);

                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                    first_stage_model->compute(n_threads, in, false, &out, work_ctx);
+                    return first_stage_model->compute(n_threads, in, false, &out, work_ctx);
                };
                sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
            } else {
@ -2557,7 +2557,7 @@ public:
            if (vae_tiling_params.enabled && !encode_video) {
                // split latent in 32x32 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                    tae_first_stage->compute(n_threads, in, false, &out, nullptr);
+                    return tae_first_stage->compute(n_threads, in, false, &out, nullptr);
                };
                sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
            } else {
@ -2675,11 +2675,15 @@ public:

                // split latent in 32x32 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                    first_stage_model->compute(n_threads, in, true, &out, nullptr);
+                    return first_stage_model->compute(n_threads, in, true, &out, nullptr);
                };
                sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
            } else {
-                first_stage_model->compute(n_threads, x, true, &result, work_ctx);
+                if(!first_stage_model->compute(n_threads, x, true, &result, work_ctx)){
+                    LOG_ERROR("Failed to decode latetnts");
+                    first_stage_model->free_compute_buffer();
+                    return nullptr;
+                }
            }
            first_stage_model->free_compute_buffer();
            process_vae_output_tensor(result);
@ -2687,11 +2691,15 @@ public:
            if (vae_tiling_params.enabled) {
                // split latent in 64x64 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                    tae_first_stage->compute(n_threads, in, true, &out);
+                    return tae_first_stage->compute(n_threads, in, true, &out);
                };
                sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
            } else {
-                tae_first_stage->compute(n_threads, x, true, &result);
+                if(!tae_first_stage->compute(n_threads, x, true, &result)){
+                    LOG_ERROR("Failed to decode latetnts");
+                    tae_first_stage->free_compute_buffer();
+                    return nullptr;
+                }
            }
            tae_first_stage->free_compute_buffer();
        }
@ -3461,6 +3469,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
        ggml_free(work_ctx);
        return nullptr;
    }
+    memset(result_images, 0, batch_count * sizeof(sd_image_t));

    for (size_t i = 0; i < decoded_images.size(); i++) {
        result_images[i].width   = width;
--- a/upscaler.cpp
+++ b/upscaler.cpp
@ -89,7 +89,7 @@ struct UpscalerGGML {

        ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
        auto on_tiling        = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-            esrgan_upscaler->compute(n_threads, in, &out);
+            return esrgan_upscaler->compute(n_threads, in, &out);
        };
        int64_t t0 = ggml_time_ms();
        sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);
Author	SHA1	Message	Date
leejet	3296545090	feat: add extra_c_crossattns support for llm embedder (#1265 )	2026-02-10 00:00:17 +08:00
akleine	d60fb27560	fix: avoid unwanted file extension changes (#1257 )	2026-02-09 23:59:43 +08:00
Wagner Bruna	c7ccafbd6f	fix: correct sdapi handling of cfg_scale and steps (#1260 )	2026-02-09 23:34:19 +08:00
stduhpf	aa0b899397	fix: improve handling of VAE decode failures (#1222 )	2026-02-09 23:29:41 +08:00
Mario Limonciello	5e264372ce	ci: add a github action to generate a Linux ROCm artifact (#1258 )	2026-02-09 23:23:06 +08:00