mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-05-08 16:28:53 +00:00
feat: SDXS-09 support and update doc (#1356)
This commit is contained in:
parent
5c243db9a8
commit
d73b4198a4
@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
|
||||
```bash
|
||||
python convert_diffusers_to_original_stable_diffusion.py \
|
||||
--model_path ./segmindtiny-sd \
|
||||
--checkpoint_path ./segmind_tiny-sd.ckpt --half
|
||||
--checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors
|
||||
```
|
||||
|
||||
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
|
||||
The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
|
||||
|
||||
|
||||
##### Another available .ckpt file:
|
||||
|
||||
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
|
||||
|
||||
To use this file, you must first adjust its non-contiguous tensors:
|
||||
|
||||
```python
|
||||
import torch
|
||||
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
|
||||
for key, value in ckpt['state_dict'].items():
|
||||
if isinstance(value, torch.Tensor):
|
||||
ckpt['state_dict'][key] = value.contiguous()
|
||||
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
|
||||
```
|
||||
|
||||
|
||||
### SDXS-512
|
||||
### SDXS-512-DreamShaper
|
||||
|
||||
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again, the authors removed more blocks from the U-Net, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE.
|
||||
##### Some ready-to-run SDXS-512 model files are available online, such as:
|
||||
|
||||
##### 1. Download the diffusers model from Hugging Face using Python:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
|
||||
pipe.save_pretrained(save_directory="sdxs")
|
||||
```
|
||||
##### 2. Create a safetensors file
|
||||
|
||||
```bash
|
||||
python convert_diffusers_to_original_stable_diffusion.py \
|
||||
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
|
||||
```
|
||||
|
||||
##### 3. Run the model as follows:
|
||||
* https://huggingface.co/akleine/sdxs-512
|
||||
* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF
|
||||
|
||||
##### Run the model as follows:
|
||||
```bash
|
||||
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
|
||||
--cfg-scale 1 --steps 1
|
||||
```
|
||||
Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.
|
||||
|
||||
Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.
|
||||
### SDXS-512-0.9
|
||||
|
||||
Although the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is a *completely different* model that is also **incredibly fast**. It is sometimes the preferred choice, so try it yourself.
|
||||
##### Download a ready-to-run file from here:
|
||||
|
||||
* https://huggingface.co/akleine/sdxs-09
|
||||
|
||||
To use this model, both options `--cfg-scale 1` and `--steps 1` are again mandatory.
|
||||
|
||||
@ -277,6 +277,7 @@ protected:
|
||||
int64_t context_dim;
|
||||
int64_t n_head;
|
||||
int64_t d_head;
|
||||
bool xtra_dim = false;
|
||||
|
||||
public:
|
||||
CrossAttention(int64_t query_dim,
|
||||
@ -288,7 +289,11 @@ public:
|
||||
query_dim(query_dim),
|
||||
context_dim(context_dim) {
|
||||
int64_t inner_dim = d_head * n_head;
|
||||
|
||||
if (context_dim == 320 && d_head == 320) {
|
||||
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
|
||||
xtra_dim = true;
|
||||
context_dim = 1024;
|
||||
}
|
||||
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
|
||||
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||
@ -313,10 +318,16 @@ public:
|
||||
int64_t n_context = context->ne[1];
|
||||
int64_t inner_dim = d_head * n_head;
|
||||
|
||||
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
|
||||
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
|
||||
if (xtra_dim) {
|
||||
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
|
||||
context->ne[0] = 1024; // patch dim
|
||||
}
|
||||
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
|
||||
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
|
||||
|
||||
if (xtra_dim) {
|
||||
context->ne[0] = 320; // reset dim to orig
|
||||
}
|
||||
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
|
||||
|
||||
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
|
||||
|
||||
@ -1019,6 +1019,7 @@ SDVersion ModelLoader::get_sd_version() {
|
||||
bool has_middle_block_1 = false;
|
||||
bool has_output_block_311 = false;
|
||||
bool has_output_block_71 = false;
|
||||
bool has_attn_1024 = false;
|
||||
|
||||
for (auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!(is_xl)) {
|
||||
@ -1091,6 +1092,10 @@ SDVersion ModelLoader::get_sd_version() {
|
||||
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos ||
|
||||
tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) {
|
||||
has_output_block_71 = true;
|
||||
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
|
||||
if (tensor_storage.ne[0] == 1024)
|
||||
has_attn_1024 = true;
|
||||
}
|
||||
}
|
||||
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
|
||||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
|
||||
@ -1164,7 +1169,7 @@ SDVersion ModelLoader::get_sd_version() {
|
||||
}
|
||||
if (!has_middle_block_1) {
|
||||
if (!has_output_block_71) {
|
||||
return VERSION_SDXS;
|
||||
return VERSION_SDXS_512_DS;
|
||||
}
|
||||
return VERSION_SD1_TINY_UNET;
|
||||
}
|
||||
@ -1174,7 +1179,7 @@ SDVersion ModelLoader::get_sd_version() {
|
||||
return VERSION_SD2_INPAINT;
|
||||
}
|
||||
if (!has_middle_block_1) {
|
||||
return VERSION_SD2_TINY_UNET;
|
||||
return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
|
||||
}
|
||||
return VERSION_SD2;
|
||||
}
|
||||
|
||||
@ -28,7 +28,8 @@ enum SDVersion {
|
||||
VERSION_SD2,
|
||||
VERSION_SD2_INPAINT,
|
||||
VERSION_SD2_TINY_UNET,
|
||||
VERSION_SDXS,
|
||||
VERSION_SDXS_512_DS,
|
||||
VERSION_SDXS_09,
|
||||
VERSION_SDXL,
|
||||
VERSION_SDXL_INPAINT,
|
||||
VERSION_SDXL_PIX2PIX,
|
||||
@ -55,14 +56,14 @@ enum SDVersion {
|
||||
};
|
||||
|
||||
static inline bool sd_version_is_sd1(SDVersion version) {
|
||||
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
|
||||
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool sd_version_is_sd2(SDVersion version) {
|
||||
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
|
||||
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
@ -1120,7 +1120,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
|
||||
for (const auto& prefix : first_stage_model_prefix_vec) {
|
||||
if (starts_with(name, prefix)) {
|
||||
name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
|
||||
if (version == VERSION_SDXS) {
|
||||
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
||||
name = "tae." + name;
|
||||
} else {
|
||||
name = prefix + name;
|
||||
|
||||
@ -30,7 +30,8 @@ const char* model_version_to_str[] = {
|
||||
"SD 2.x",
|
||||
"SD 2.x Inpaint",
|
||||
"SD 2.x Tiny UNet",
|
||||
"SDXS",
|
||||
"SDXS (512-DS)",
|
||||
"SDXS (09)",
|
||||
"SDXL",
|
||||
"SDXL Inpaint",
|
||||
"SDXL Instruct-Pix2Pix",
|
||||
@ -414,7 +415,7 @@ public:
|
||||
}
|
||||
|
||||
bool tae_preview_only = sd_ctx_params->tae_preview_only;
|
||||
if (version == VERSION_SDXS) {
|
||||
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
||||
tae_preview_only = false;
|
||||
use_tae = true;
|
||||
}
|
||||
|
||||
@ -217,11 +217,11 @@ public:
|
||||
} else if (sd_version_is_unet_edit(version)) {
|
||||
in_channels = 8;
|
||||
}
|
||||
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
|
||||
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
||||
num_res_blocks = 1;
|
||||
channel_mult = {1, 2, 4};
|
||||
tiny_unet = true;
|
||||
if (version == VERSION_SDXS) {
|
||||
if (version == VERSION_SDXS_512_DS) {
|
||||
attention_resolutions = {4, 2}; // here just like SDXL
|
||||
}
|
||||
}
|
||||
@ -264,6 +264,10 @@ public:
|
||||
if (version == VERSION_SVD) {
|
||||
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
|
||||
} else {
|
||||
if (version == VERSION_SDXS_09 && n_head == 5) {
|
||||
n_head = 1; // to carry a special case of sdxs_09 into CrossAttentionLayer,
|
||||
d_head = 320; // works as long the product remains equal (5*64 == 1*320)
|
||||
}
|
||||
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
|
||||
}
|
||||
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user