feat: SDXS-09 support and update doc (#1356)

akleine 2026-04-16 19:11:44 +02:00 committed by GitHub
parent 5c243db9a8
commit d73b4198a4
7 changed files with 51 additions and 48 deletions

@@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```bash
python convert_diffusers_to_original_stable_diffusion.py \
  --model_path ./segmindtiny-sd \
-  --checkpoint_path ./segmind_tiny-sd.ckpt --half
+  --checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors
```
-The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
+The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
-##### Another available .ckpt file:
-* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
-To use this file, you must first adjust its non-contiguous tensors:
-```python
-import torch
-ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
-for key, value in ckpt['state_dict'].items():
-    if isinstance(value, torch.Tensor):
-        ckpt['state_dict'][key] = value.contiguous()
-torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
-```
-### SDXS-512
+### SDXS-512-DreamShaper
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of the U-Net part, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
-##### 1. Download the diffusers model from Hugging Face using Python:
-```python
-from diffusers import StableDiffusionPipeline
-pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
-pipe.save_pretrained(save_directory="sdxs")
-```
-##### 2. Create a safetensors file
-```bash
-python convert_diffusers_to_original_stable_diffusion.py \
-  --model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
-```
-##### 3. Run the model as follows:
+##### Some ready-to-run SDXS-512 model files are available online, such as:
+* https://huggingface.co/akleine/sdxs-512
+* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF
+##### Run the model as follows:
```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
  --cfg-scale 1 --steps 1
```
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.
+### SDXS-512-0.9
+Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is *completely different* but also **incredibly fast**. Sometimes it is preferred, so try it yourself.
+##### Download a ready-to-run file from here:
+* https://huggingface.co/akleine/sdxs-09
+To use this model, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are again mandatory.
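For comparison, the same one-step, guidance-free generation can be sketched with diffusers. This is only an illustration, assuming the IDKiro/sdxs-512-dreamshaper weights and a working diffusers install; it is not part of the documented sd.cpp workflow:

```python
# Rough diffusers equivalent of `sd-cli ... --cfg-scale 1 --steps 1` (illustration only).
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
image = pipe(
    "portrait of a lovely cat",
    num_inference_steps=1,  # SDXS is a one-step model
    guidance_scale=1.0,     # values <= 1.0 disable classifier-free guidance
).images[0]
image.save("sdxs_cat.png")
```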

@@ -277,6 +277,7 @@ protected:
int64_t context_dim;
int64_t n_head;
int64_t d_head;
+bool xtra_dim = false;
public:
CrossAttention(int64_t query_dim,
@@ -288,7 +289,11 @@ public:
query_dim(query_dim),
context_dim(context_dim) {
int64_t inner_dim = d_head * n_head;
+if (context_dim == 320 && d_head == 320) {
+    // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
+    xtra_dim = true;
+    context_dim = 1024;
+}
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
@@ -313,10 +318,16 @@ public:
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
+if (xtra_dim) {
+    // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
+    context->ne[0] = 1024; // patch dim
+}
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
+if (xtra_dim) {
+    context->ne[0] = 320; // reset dim to orig
+}
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
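To make the shapes behind this xtra_dim special case easier to follow, here is a rough Python sketch of the resulting projections. It only mirrors the layer sizes (batch, token, and context counts are made up) and does not model the in-place ggml dimension patch itself:

```python
# Sketch of the SDXS-09 special case: to_k/to_v read a 1024-dim context while
# to_q and the attention output stay at inner_dim = n_head * d_head = 1 * 320.
import torch
import torch.nn as nn

query_dim = 320     # channels of x at this block
inner_dim = 320     # n_head * d_head after the SDXS-09 head remap
context_dim = 1024  # patched from 320 when xtra_dim is set

to_q = nn.Linear(query_dim, inner_dim, bias=False)
to_k = nn.Linear(context_dim, inner_dim, bias=False)
to_v = nn.Linear(context_dim, inner_dim, bias=False)

x = torch.randn(1, 64 * 64, query_dim)     # [N, n_token, query_dim] (made-up sizes)
context = torch.randn(1, 77, context_dim)  # [N, n_context, context_dim]
q, k, v = to_q(x), to_k(context), to_v(context)
print(q.shape, k.shape, v.shape)           # every projection ends in inner_dim == 320
```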

@@ -1019,6 +1019,7 @@ SDVersion ModelLoader::get_sd_version() {
bool has_middle_block_1 = false;
bool has_output_block_311 = false;
bool has_output_block_71 = false;
+bool has_attn_1024 = false;
for (auto& [name, tensor_storage] : tensor_storage_map) {
if (!(is_xl)) {
@@ -1091,6 +1092,10 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos ||
tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) {
has_output_block_71 = true;
+if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
+    if (tensor_storage.ne[0] == 1024)
+        has_attn_1024 = true;
+}
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
@@ -1164,7 +1169,7 @@ SDVersion ModelLoader::get_sd_version() {
}
if (!has_middle_block_1) {
if (!has_output_block_71) {
-return VERSION_SDXS;
+return VERSION_SDXS_512_DS;
}
return VERSION_SD1_TINY_UNET;
}
@@ -1174,7 +1179,7 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_SD2_INPAINT;
}
if (!has_middle_block_1) {
-return VERSION_SD2_TINY_UNET;
+return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
}
return VERSION_SD2;
}

@@ -28,7 +28,8 @@ enum SDVersion {
VERSION_SD2,
VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET,
-VERSION_SDXS,
+VERSION_SDXS_512_DS,
+VERSION_SDXS_09,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
@@ -55,14 +56,14 @@ enum SDVersion {
};
static inline bool sd_version_is_sd1(SDVersion version) {
-if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
+if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) {
return true;
}
return false;
}
static inline bool sd_version_is_sd2(SDVersion version) {
-if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
+if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
return true;
}
return false;

@@ -1120,7 +1120,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
for (const auto& prefix : first_stage_model_prefix_vec) {
if (starts_with(name, prefix)) {
name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
-if (version == VERSION_SDXS) {
+if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
name = "tae." + name;
} else {
name = prefix + name;

@@ -30,7 +30,8 @@ const char* model_version_to_str[] = {
"SD 2.x",
"SD 2.x Inpaint",
"SD 2.x Tiny UNet",
-"SDXS",
+"SDXS (512-DS)",
+"SDXS (09)",
"SDXL",
"SDXL Inpaint",
"SDXL Instruct-Pix2Pix",
@@ -414,7 +415,7 @@ public:
}
bool tae_preview_only = sd_ctx_params->tae_preview_only;
-if (version == VERSION_SDXS) {
+if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
tae_preview_only = false;
use_tae = true;
}

@@ -217,11 +217,11 @@ public:
} else if (sd_version_is_unet_edit(version)) {
in_channels = 8;
}
-if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
+if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
tiny_unet = true;
-if (version == VERSION_SDXS) {
+if (version == VERSION_SDXS_512_DS) {
attention_resolutions = {4, 2}; // here just like SDXL
}
}
@@ -264,6 +264,10 @@ public:
if (version == VERSION_SVD) {
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
} else {
+if (version == VERSION_SDXS_09 && n_head == 5) {
+    n_head = 1;   // carry the SDXS-09 special case into CrossAttention,
+    d_head = 320; // works as long as the product remains equal (5*64 == 1*320)
+}
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
}
};
}; };