feat: SDXS-09 support and update doc (#1356)

akleine 2026-04-16 19:11:44 +02:00 committed by GitHub
parent 5c243db9a8
commit d73b4198a4
7 changed files with 51 additions and 48 deletions

@@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```bash
python convert_diffusers_to_original_stable_diffusion.py \
  --model_path ./segmindtiny-sd \
-  --checkpoint_path ./segmind_tiny-sd.ckpt --half
+  --checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors
```
-The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
+The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
-##### Another available .ckpt file:
-* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
-To use this file, you must first adjust its non-contiguous tensors:
-```python
-import torch
-ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
-for key, value in ckpt['state_dict'].items():
-    if isinstance(value, torch.Tensor):
-        ckpt['state_dict'][key] = value.contiguous()
-torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
-```
-### SDXS-512
+### SDXS-512-DreamShaper
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of the U-Net part, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
-##### 1. Download the diffusers model from Hugging Face using Python:
-```python
-from diffusers import StableDiffusionPipeline
-pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
-pipe.save_pretrained(save_directory="sdxs")
-```
-##### 2. Create a safetensors file
-```bash
-python convert_diffusers_to_original_stable_diffusion.py \
-  --model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
-```
-##### 3. Run the model as follows:
+##### Some ready-to-run SDXS-512 model files are available online, such as:
+* https://huggingface.co/akleine/sdxs-512
+* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF
+##### Run the model as follows:
```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
  --cfg-scale 1 --steps 1
```
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.
+### SDXS-512-0.9
+Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is *completely different* but also **incredibly fast**. Sometimes it is preferred, so try it yourself.
+##### Download a ready-to-run file from here:
+* https://huggingface.co/akleine/sdxs-09
+To use this model, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are again mandatory.
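For comparison, the same one-step, guidance-free generation can be sketched with diffusers. This is only an illustration, assuming the IDKiro/sdxs-512-dreamshaper weights and a working diffusers install; it is not part of the documented sd.cpp workflow:

```python
# Rough diffusers equivalent of `sd-cli ... --cfg-scale 1 --steps 1` (illustration only).
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
image = pipe(
    "portrait of a lovely cat",
    num_inference_steps=1,  # SDXS is a one-step model
    guidance_scale=1.0,     # values <= 1.0 disable classifier-free guidance
).images[0]
image.save("sdxs_cat.png")
```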

@@ -277,6 +277,7 @@ protected:
int64_t context_dim;
int64_t n_head;
int64_t d_head;
+bool xtra_dim = false;
public:
CrossAttention(int64_t query_dim,
@@ -288,7 +289,11 @@ public:
query_dim(query_dim),
context_dim(context_dim) {
int64_t inner_dim = d_head * n_head;
+if (context_dim == 320 && d_head == 320) {
+    // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
+    xtra_dim = true;
+    context_dim = 1024;
+}
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
@@ -313,10 +318,16 @@ public:
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
+if (xtra_dim) {
+    // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
+    context->ne[0] = 1024; // patch dim
+}
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
+if (xtra_dim) {
+    context->ne[0] = 320; // reset dim to orig
+}
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
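To make the shapes behind this xtra_dim special case easier to follow, here is a rough Python sketch of the resulting projections. It only mirrors the layer sizes (batch, token, and context counts are made up) and does not model the in-place ggml dimension patch itself:

```python
# Sketch of the SDXS-09 special case: to_k/to_v read a 1024-dim context while
# to_q and the attention output stay at inner_dim = n_head * d_head = 1 * 320.
import torch
import torch.nn as nn

query_dim = 320     # channels of x at this block
inner_dim = 320     # n_head * d_head after the SDXS-09 head remap
context_dim = 1024  # patched from 320 when xtra_dim is set

to_q = nn.Linear(query_dim, inner_dim, bias=False)
to_k = nn.Linear(context_dim, inner_dim, bias=False)
to_v = nn.Linear(context_dim, inner_dim, bias=False)

x = torch.randn(1, 64 * 64, query_dim)     # [N, n_token, query_dim] (made-up sizes)
context = torch.randn(1, 77, context_dim)  # [N, n_context, context_dim]
q, k, v = to_q(x), to_k(context), to_v(context)
print(q.shape, k.shape, v.shape)           # every projection ends in inner_dim == 320
```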

@@ -1019,6 +1019,7 @@ SDVersion ModelLoader::get_sd_version() {
bool has_middle_block_1 = false;
bool has_output_block_311 = false;
bool has_output_block_71 = false;
+bool has_attn_1024 = false;
for (auto& [name, tensor_storage] : tensor_storage_map) {
if (!(is_xl)) {
@@ -1091,6 +1092,10 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos ||
tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) {
has_output_block_71 = true;
+if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
+    if (tensor_storage.ne[0] == 1024)
+        has_attn_1024 = true;
+}
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
@@ -1164,7 +1169,7 @@ SDVersion ModelLoader::get_sd_version() {
}
if (!has_middle_block_1) {
if (!has_output_block_71) {
-return VERSION_SDXS;
+return VERSION_SDXS_512_DS;
}
return VERSION_SD1_TINY_UNET;
}
@@ -1174,7 +1179,7 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_SD2_INPAINT;
}
if (!has_middle_block_1) {
-return VERSION_SD2_TINY_UNET;
+return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
}
return VERSION_SD2;
}

@@ -28,7 +28,8 @@ enum SDVersion {
VERSION_SD2,
VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET,
-VERSION_SDXS,
+VERSION_SDXS_512_DS,
+VERSION_SDXS_09,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
@@ -55,14 +56,14 @@ enum SDVersion {
};
static inline bool sd_version_is_sd1(SDVersion version) {
-if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
+if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) {
return true;
}
return false;
}
static inline bool sd_version_is_sd2(SDVersion version) {
-if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
+if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
return true;
}
return false;

@@ -1120,7 +1120,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
for (const auto& prefix : first_stage_model_prefix_vec) {
if (starts_with(name, prefix)) {
name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
-if (version == VERSION_SDXS) {
+if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
name = "tae." + name;
} else {
name = prefix + name;

@@ -30,7 +30,8 @@ const char* model_version_to_str[] = {
"SD 2.x",
"SD 2.x Inpaint",
"SD 2.x Tiny UNet",
-"SDXS",
+"SDXS (512-DS)",
+"SDXS (09)",
"SDXL",
"SDXL Inpaint",
"SDXL Instruct-Pix2Pix",
@@ -414,7 +415,7 @@ public:
}
bool tae_preview_only = sd_ctx_params->tae_preview_only;
-if (version == VERSION_SDXS) {
+if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
tae_preview_only = false;
use_tae = true;
}

@@ -217,11 +217,11 @@ public:
} else if (sd_version_is_unet_edit(version)) {
in_channels = 8;
}
-if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
+if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
tiny_unet = true;
-if (version == VERSION_SDXS) {
+if (version == VERSION_SDXS_512_DS) {
attention_resolutions = {4, 2}; // here just like SDXL
}
}
@@ -264,6 +264,10 @@ public:
if (version == VERSION_SVD) {
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
} else {
+if (version == VERSION_SDXS_09 && n_head == 5) {
+    n_head = 1;   // carry the SDXS-09 special case into CrossAttention,
+    d_head = 320; // works as long as the product remains equal (5*64 == 1*320)
+}
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
}
};
}; };