mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-05-08 16:28:53 +00:00
feat: SDXS-09 support and update doc (#1356)
This commit is contained in:
parent
5c243db9a8
commit
d73b4198a4
@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
|
|||||||
```bash
|
```bash
|
||||||
python convert_diffusers_to_original_stable_diffusion.py \
|
python convert_diffusers_to_original_stable_diffusion.py \
|
||||||
--model_path ./segmindtiny-sd \
|
--model_path ./segmindtiny-sd \
|
||||||
--checkpoint_path ./segmind_tiny-sd.ckpt --half
|
--checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors
|
||||||
```
|
```
|
||||||
|
|
||||||
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
|
The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
|
||||||
|
|
||||||
|
|
||||||
##### Another available .ckpt file:
|
### SDXS-512-DreamShaper
|
||||||
|
|
||||||
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
|
|
||||||
|
|
||||||
To use this file, you must first adjust its non-contiguous tensors:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import torch
|
|
||||||
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
|
|
||||||
for key, value in ckpt['state_dict'].items():
|
|
||||||
if isinstance(value, torch.Tensor):
|
|
||||||
ckpt['state_dict'][key] = value.contiguous()
|
|
||||||
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
### SDXS-512
|
|
||||||
|
|
||||||
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again, the authors removed further blocks from the U-Net, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE.
|
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again, the authors removed further blocks from the U-Net, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE.
|
||||||
|
##### Some ready-to-run SDXS-512 model files are available online, such as:
|
||||||
|
|
||||||
##### 1. Download the diffusers model from Hugging Face using Python:
|
* https://huggingface.co/akleine/sdxs-512
|
||||||
|
* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF
|
||||||
```python
|
|
||||||
from diffusers import StableDiffusionPipeline
|
|
||||||
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
|
|
||||||
pipe.save_pretrained(save_directory="sdxs")
|
|
||||||
```
|
|
||||||
##### 2. Create a safetensors file
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python convert_diffusers_to_original_stable_diffusion.py \
|
|
||||||
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
|
|
||||||
```
|
|
||||||
|
|
||||||
##### 3. Run the model as follows:
|
|
||||||
|
|
||||||
|
##### Run the model as follows:
|
||||||
```bash
|
```bash
|
||||||
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
|
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
|
||||||
--cfg-scale 1 --steps 1
|
--cfg-scale 1 --steps 1
|
||||||
```
|
```
|
||||||
|
|
||||||
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.
|
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.
|
||||||
|
|
||||||
|
### SDXS-512-0.9
|
||||||
|
|
||||||
|
Although the name "SDXS-512-0.9" resembles "SDXS-512-DreamShaper", it is a *completely different* model that is also **incredibly fast**. In some cases it may be the better choice, so try it yourself.
|
||||||
|
##### Download a ready-to-run file from here:
|
||||||
|
|
||||||
|
* https://huggingface.co/akleine/sdxs-09
|
||||||
|
|
||||||
|
When using this model, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are again required.
|
||||||
|
|||||||
@ -277,6 +277,7 @@ protected:
|
|||||||
int64_t context_dim;
|
int64_t context_dim;
|
||||||
int64_t n_head;
|
int64_t n_head;
|
||||||
int64_t d_head;
|
int64_t d_head;
|
||||||
|
bool xtra_dim = false;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CrossAttention(int64_t query_dim,
|
CrossAttention(int64_t query_dim,
|
||||||
@ -288,7 +289,11 @@ public:
|
|||||||
query_dim(query_dim),
|
query_dim(query_dim),
|
||||||
context_dim(context_dim) {
|
context_dim(context_dim) {
|
||||||
int64_t inner_dim = d_head * n_head;
|
int64_t inner_dim = d_head * n_head;
|
||||||
|
if (context_dim == 320 && d_head == 320) {
|
||||||
|
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
|
||||||
|
xtra_dim = true;
|
||||||
|
context_dim = 1024;
|
||||||
|
}
|
||||||
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
|
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
|
||||||
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||||
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||||
@ -313,10 +318,16 @@ public:
|
|||||||
int64_t n_context = context->ne[1];
|
int64_t n_context = context->ne[1];
|
||||||
int64_t inner_dim = d_head * n_head;
|
int64_t inner_dim = d_head * n_head;
|
||||||
|
|
||||||
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
|
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
|
||||||
|
if (xtra_dim) {
|
||||||
|
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
|
||||||
|
context->ne[0] = 1024; // patch dim
|
||||||
|
}
|
||||||
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
|
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
|
||||||
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
|
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
|
||||||
|
if (xtra_dim) {
|
||||||
|
context->ne[0] = 320; // reset dim to orig
|
||||||
|
}
|
||||||
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
|
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
|
||||||
|
|
||||||
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
|
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
|
||||||
|
|||||||
@ -1019,6 +1019,7 @@ SDVersion ModelLoader::get_sd_version() {
|
|||||||
bool has_middle_block_1 = false;
|
bool has_middle_block_1 = false;
|
||||||
bool has_output_block_311 = false;
|
bool has_output_block_311 = false;
|
||||||
bool has_output_block_71 = false;
|
bool has_output_block_71 = false;
|
||||||
|
bool has_attn_1024 = false;
|
||||||
|
|
||||||
for (auto& [name, tensor_storage] : tensor_storage_map) {
|
for (auto& [name, tensor_storage] : tensor_storage_map) {
|
||||||
if (!(is_xl)) {
|
if (!(is_xl)) {
|
||||||
@ -1091,6 +1092,10 @@ SDVersion ModelLoader::get_sd_version() {
|
|||||||
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos ||
|
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos ||
|
||||||
tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) {
|
tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) {
|
||||||
has_output_block_71 = true;
|
has_output_block_71 = true;
|
||||||
|
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
|
||||||
|
if (tensor_storage.ne[0] == 1024)
|
||||||
|
has_attn_1024 = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
|
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
|
||||||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
|
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
|
||||||
@ -1164,7 +1169,7 @@ SDVersion ModelLoader::get_sd_version() {
|
|||||||
}
|
}
|
||||||
if (!has_middle_block_1) {
|
if (!has_middle_block_1) {
|
||||||
if (!has_output_block_71) {
|
if (!has_output_block_71) {
|
||||||
return VERSION_SDXS;
|
return VERSION_SDXS_512_DS;
|
||||||
}
|
}
|
||||||
return VERSION_SD1_TINY_UNET;
|
return VERSION_SD1_TINY_UNET;
|
||||||
}
|
}
|
||||||
@ -1174,7 +1179,7 @@ SDVersion ModelLoader::get_sd_version() {
|
|||||||
return VERSION_SD2_INPAINT;
|
return VERSION_SD2_INPAINT;
|
||||||
}
|
}
|
||||||
if (!has_middle_block_1) {
|
if (!has_middle_block_1) {
|
||||||
return VERSION_SD2_TINY_UNET;
|
return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
|
||||||
}
|
}
|
||||||
return VERSION_SD2;
|
return VERSION_SD2;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -28,7 +28,8 @@ enum SDVersion {
|
|||||||
VERSION_SD2,
|
VERSION_SD2,
|
||||||
VERSION_SD2_INPAINT,
|
VERSION_SD2_INPAINT,
|
||||||
VERSION_SD2_TINY_UNET,
|
VERSION_SD2_TINY_UNET,
|
||||||
VERSION_SDXS,
|
VERSION_SDXS_512_DS,
|
||||||
|
VERSION_SDXS_09,
|
||||||
VERSION_SDXL,
|
VERSION_SDXL,
|
||||||
VERSION_SDXL_INPAINT,
|
VERSION_SDXL_INPAINT,
|
||||||
VERSION_SDXL_PIX2PIX,
|
VERSION_SDXL_PIX2PIX,
|
||||||
@ -55,14 +56,14 @@ enum SDVersion {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static inline bool sd_version_is_sd1(SDVersion version) {
|
static inline bool sd_version_is_sd1(SDVersion version) {
|
||||||
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
|
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline bool sd_version_is_sd2(SDVersion version) {
|
static inline bool sd_version_is_sd2(SDVersion version) {
|
||||||
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
|
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@ -1120,7 +1120,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
|
|||||||
for (const auto& prefix : first_stage_model_prefix_vec) {
|
for (const auto& prefix : first_stage_model_prefix_vec) {
|
||||||
if (starts_with(name, prefix)) {
|
if (starts_with(name, prefix)) {
|
||||||
name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
|
name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
|
||||||
if (version == VERSION_SDXS) {
|
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
||||||
name = "tae." + name;
|
name = "tae." + name;
|
||||||
} else {
|
} else {
|
||||||
name = prefix + name;
|
name = prefix + name;
|
||||||
|
|||||||
@ -30,7 +30,8 @@ const char* model_version_to_str[] = {
|
|||||||
"SD 2.x",
|
"SD 2.x",
|
||||||
"SD 2.x Inpaint",
|
"SD 2.x Inpaint",
|
||||||
"SD 2.x Tiny UNet",
|
"SD 2.x Tiny UNet",
|
||||||
"SDXS",
|
"SDXS (512-DS)",
|
||||||
|
"SDXS (09)",
|
||||||
"SDXL",
|
"SDXL",
|
||||||
"SDXL Inpaint",
|
"SDXL Inpaint",
|
||||||
"SDXL Instruct-Pix2Pix",
|
"SDXL Instruct-Pix2Pix",
|
||||||
@ -414,7 +415,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool tae_preview_only = sd_ctx_params->tae_preview_only;
|
bool tae_preview_only = sd_ctx_params->tae_preview_only;
|
||||||
if (version == VERSION_SDXS) {
|
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
||||||
tae_preview_only = false;
|
tae_preview_only = false;
|
||||||
use_tae = true;
|
use_tae = true;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -217,11 +217,11 @@ public:
|
|||||||
} else if (sd_version_is_unet_edit(version)) {
|
} else if (sd_version_is_unet_edit(version)) {
|
||||||
in_channels = 8;
|
in_channels = 8;
|
||||||
}
|
}
|
||||||
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
|
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
||||||
num_res_blocks = 1;
|
num_res_blocks = 1;
|
||||||
channel_mult = {1, 2, 4};
|
channel_mult = {1, 2, 4};
|
||||||
tiny_unet = true;
|
tiny_unet = true;
|
||||||
if (version == VERSION_SDXS) {
|
if (version == VERSION_SDXS_512_DS) {
|
||||||
attention_resolutions = {4, 2}; // here just like SDXL
|
attention_resolutions = {4, 2}; // here just like SDXL
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -264,6 +264,10 @@ public:
|
|||||||
if (version == VERSION_SVD) {
|
if (version == VERSION_SVD) {
|
||||||
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
|
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
|
||||||
} else {
|
} else {
|
||||||
|
if (version == VERSION_SDXS_09 && n_head == 5) {
|
||||||
|
n_head = 1; // to carry a special case of sdxs_09 into CrossAttentionLayer,
|
||||||
|
d_head = 320; // works as long the product remains equal (5*64 == 1*320)
|
||||||
|
}
|
||||||
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
|
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user