Compare commits

..

No commits in common. "7010bb4dff7bd55b03d35ef9772142c21699eba9" and "885e62ea822e674c6837a8225d2d75f021b97a6a" have entirely different histories.

9 changed files with 13 additions and 97 deletions

View File

@ -83,7 +83,7 @@ python convert_diffusers_to_original_stable_diffusion.py \
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above. The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
##### Another available .ckpt file: ### Another available .ckpt file:
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
@ -97,31 +97,3 @@ for key, value in ckpt['state_dict'].items():
ckpt['state_dict'][key] = value.contiguous() ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt") torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
``` ```
### SDXS-512
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of the U-Net part, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
##### 1. Download the diffusers model from Hugging Face using Python:
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
pipe.save_pretrained(save_directory="sdxs")
```
##### 2. Create a safetensors file
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
```
##### 3. Run the model as follows:
```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
--cfg-scale 1 --steps 1
```
Both options, ``` --cfg-scale 1 ``` and ``` --steps 1 ```, are mandatory here.

View File

@ -1594,30 +1594,10 @@ struct SDGenerationParams {
load_if_exists("skip_layers", skip_layers); load_if_exists("skip_layers", skip_layers);
load_if_exists("high_noise_skip_layers", high_noise_skip_layers); load_if_exists("high_noise_skip_layers", high_noise_skip_layers);
load_if_exists("steps", sample_params.sample_steps);
load_if_exists("high_noise_steps", high_noise_sample_params.sample_steps);
load_if_exists("cfg_scale", sample_params.guidance.txt_cfg); load_if_exists("cfg_scale", sample_params.guidance.txt_cfg);
load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg); load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg);
load_if_exists("guidance", sample_params.guidance.distilled_guidance); load_if_exists("guidance", sample_params.guidance.distilled_guidance);
auto load_sampler_if_exists = [&](const char* key, enum sample_method_t& out) {
if (j.contains(key) && j[key].is_string()) {
enum sample_method_t tmp = str_to_sample_method(j[key].get<std::string>().c_str());
if (tmp != SAMPLE_METHOD_COUNT) {
out = tmp;
}
}
};
load_sampler_if_exists("sample_method", sample_params.sample_method);
load_sampler_if_exists("high_noise_sample_method", high_noise_sample_params.sample_method);
if (j.contains("scheduler") && j["scheduler"].is_string()) {
enum scheduler_t tmp = str_to_scheduler(j["scheduler"].get<std::string>().c_str());
if (tmp != SCHEDULER_COUNT) {
sample_params.scheduler = tmp;
}
}
return true; return true;
} }

View File

@ -420,9 +420,6 @@ int main(int argc, const char** argv) {
return; return;
} }
if (gen_params.sample_params.sample_steps > 100)
gen_params.sample_params.sample_steps = 100;
if (!gen_params.process_and_check(IMG_GEN, "")) { if (!gen_params.process_and_check(IMG_GEN, "")) {
res.status = 400; res.status = 400;
res.set_content(R"({"error":"invalid params"})", "application/json"); res.set_content(R"({"error":"invalid params"})", "application/json");
@ -601,9 +598,6 @@ int main(int argc, const char** argv) {
return; return;
} }
if (gen_params.sample_params.sample_steps > 100)
gen_params.sample_params.sample_steps = 100;
if (!gen_params.process_and_check(IMG_GEN, "")) { if (!gen_params.process_and_check(IMG_GEN, "")) {
res.status = 400; res.status = 400;
res.set_content(R"({"error":"invalid params"})", "application/json"); res.set_content(R"({"error":"invalid params"})", "application/json");

2
ggml

@ -1 +1 @@
Subproject commit 8891ab6fc742ac1198736d3da3b73c730e42af84 Subproject commit 3e9f2ba3b934c20b26873b3c60dbf41b116978ff

View File

@ -1038,7 +1038,6 @@ SDVersion ModelLoader::get_sd_version() {
int64_t patch_embedding_channels = 0; int64_t patch_embedding_channels = 0;
bool has_img_emb = false; bool has_img_emb = false;
bool has_middle_block_1 = false; bool has_middle_block_1 = false;
bool has_output_block_71 = false;
for (auto& [name, tensor_storage] : tensor_storage_map) { for (auto& [name, tensor_storage] : tensor_storage_map) {
if (!(is_xl)) { if (!(is_xl)) {
@ -1095,9 +1094,6 @@ SDVersion ModelLoader::get_sd_version() {
tensor_storage.name.find("unet.mid_block.resnets.1.") != std::string::npos) { tensor_storage.name.find("unet.mid_block.resnets.1.") != std::string::npos) {
has_middle_block_1 = true; has_middle_block_1 = true;
} }
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos) {
has_output_block_71 = true;
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" || if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" || tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
tensor_storage.name == "text_model.embeddings.token_embedding.weight" || tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@ -1159,9 +1155,6 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_SD1_PIX2PIX; return VERSION_SD1_PIX2PIX;
} }
if (!has_middle_block_1) { if (!has_middle_block_1) {
if (!has_output_block_71) {
return VERSION_SDXS;
}
return VERSION_SD1_TINY_UNET; return VERSION_SD1_TINY_UNET;
} }
return VERSION_SD1; return VERSION_SD1;

View File

@ -28,7 +28,6 @@ enum SDVersion {
VERSION_SD2, VERSION_SD2,
VERSION_SD2_INPAINT, VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET, VERSION_SD2_TINY_UNET,
VERSION_SDXS,
VERSION_SDXL, VERSION_SDXL,
VERSION_SDXL_INPAINT, VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX, VERSION_SDXL_PIX2PIX,
@ -51,7 +50,7 @@ enum SDVersion {
}; };
static inline bool sd_version_is_sd1(SDVersion version) { static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) { if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET) {
return true; return true;
} }
return false; return false;

View File

@ -31,7 +31,6 @@ const char* model_version_to_str[] = {
"SD 2.x", "SD 2.x",
"SD 2.x Inpaint", "SD 2.x Inpaint",
"SD 2.x Tiny UNet", "SD 2.x Tiny UNet",
"SDXS",
"SDXL", "SDXL",
"SDXL Inpaint", "SDXL Inpaint",
"SDXL Instruct-Pix2Pix", "SDXL Instruct-Pix2Pix",
@ -408,11 +407,6 @@ public:
vae_decode_only = false; vae_decode_only = false;
} }
bool tae_preview_only = sd_ctx_params->tae_preview_only;
if (version == VERSION_SDXS) {
tae_preview_only = false;
}
if (sd_ctx_params->circular_x || sd_ctx_params->circular_y) { if (sd_ctx_params->circular_x || sd_ctx_params->circular_y) {
LOG_INFO("Using circular padding for convolutions"); LOG_INFO("Using circular padding for convolutions");
} }
@ -597,7 +591,7 @@ public:
vae_backend = backend; vae_backend = backend;
} }
if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) { if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend, first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
offload_params_to_cpu, offload_params_to_cpu,
@ -635,7 +629,8 @@ public:
first_stage_model->get_param_tensors(tensors, "first_stage_model"); first_stage_model->get_param_tensors(tensors, "first_stage_model");
} }
} }
if (use_tiny_autoencoder || version == VERSION_SDXS) {
if (use_tiny_autoencoder) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend, tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend,
offload_params_to_cpu, offload_params_to_cpu,
@ -650,10 +645,6 @@ public:
"decoder.layers", "decoder.layers",
vae_decode_only, vae_decode_only,
version); version);
if (version == VERSION_SDXS) {
tae_first_stage->alloc_params_buffer();
tae_first_stage->get_param_tensors(tensors, "first_stage_model");
}
} }
if (sd_ctx_params->vae_conv_direct) { if (sd_ctx_params->vae_conv_direct) {
LOG_INFO("Using Conv2d direct in the tae model"); LOG_INFO("Using Conv2d direct in the tae model");
@ -791,15 +782,14 @@ public:
unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size(); unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
} }
size_t vae_params_mem_size = 0; size_t vae_params_mem_size = 0;
if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) { if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
vae_params_mem_size = first_stage_model->get_params_buffer_size(); vae_params_mem_size = first_stage_model->get_params_buffer_size();
} }
if (use_tiny_autoencoder || version == VERSION_SDXS) { if (use_tiny_autoencoder) {
if (use_tiny_autoencoder && !tae_first_stage->load_from_file(taesd_path, n_threads)) { if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
return false; return false;
} }
use_tiny_autoencoder = true; // now the processing is identical for VERSION_SDXS vae_params_mem_size = tae_first_stage->get_params_buffer_size();
vae_params_mem_size = tae_first_stage->get_params_buffer_size();
} }
size_t control_net_params_mem_size = 0; size_t control_net_params_mem_size = 0;
if (control_net) { if (control_net) {
@ -955,7 +945,7 @@ public:
} }
ggml_free(ctx); ggml_free(ctx);
use_tiny_autoencoder = use_tiny_autoencoder && !tae_preview_only; use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only;
return true; return true;
} }

11
tae.hpp
View File

@ -505,8 +505,7 @@ struct TinyAutoEncoder : public GGMLRunner {
struct ggml_tensor** output, struct ggml_tensor** output,
struct ggml_context* output_ctx = nullptr) = 0; struct ggml_context* output_ctx = nullptr) = 0;
virtual bool load_from_file(const std::string& file_path, int n_threads) = 0; virtual bool load_from_file(const std::string& file_path, int n_threads) = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
}; };
struct TinyImageAutoEncoder : public TinyAutoEncoder { struct TinyImageAutoEncoder : public TinyAutoEncoder {
@ -556,10 +555,6 @@ struct TinyImageAutoEncoder : public TinyAutoEncoder {
return success; return success;
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
taesd.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z); z = to_backend(z);
@ -629,10 +624,6 @@ struct TinyVideoAutoEncoder : public TinyAutoEncoder {
return success; return success;
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
taehv.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z); z = to_backend(z);

View File

@ -215,13 +215,10 @@ public:
} else if (sd_version_is_unet_edit(version)) { } else if (sd_version_is_unet_edit(version)) {
in_channels = 8; in_channels = 8;
} }
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) { if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET) {
num_res_blocks = 1; num_res_blocks = 1;
channel_mult = {1, 2, 4}; channel_mult = {1, 2, 4};
tiny_unet = true; tiny_unet = true;
if (version == VERSION_SDXS) {
attention_resolutions = {4, 2}; // here just like SDXL
}
} }
// dims is always 2 // dims is always 2