2026-06-25 15:46:40 +00:00
9 changed files with 13 additions and 97 deletions
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@@ -83,7 +83,7 @@ python convert_diffusers_to_original_stable_diffusion.py \
 The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.


-##### Another available .ckpt file:
+### Another available .ckpt file:

 * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt

@@ -97,31 +97,3 @@ for key, value in ckpt['state_dict'].items():
        ckpt['state_dict'][key] = value.contiguous()
 torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
 ```
-
-
-### SDXS-512
-
-Another very tiny and **incredibly fast**  model is SDXS by IDKiro et al.  The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part.
-
-##### 1. Download the diffusers model from  Hugging Face using Python:
-
-```python
-from diffusers import StableDiffusionPipeline
-pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
-pipe.save_pretrained(save_directory="sdxs")
-```
-##### 2. Create a safetensors file
-
-```bash
-python convert_diffusers_to_original_stable_diffusion.py \
-    --model_path  sdxs  --checkpoint_path sdxs.safetensors --half --use_safetensors
-```
-
-##### 3. Run the model as follows:
-
-```bash
-~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
-  --cfg-scale 1 --steps 1
-```
-
-Both options: ``` --cfg-scale 1 ``` and  ``` --steps 1 ``` are mandatory here.                                                 
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -1594,30 +1594,10 @@ struct SDGenerationParams {
        load_if_exists("skip_layers", skip_layers);
        load_if_exists("high_noise_skip_layers", high_noise_skip_layers);

-        load_if_exists("steps", sample_params.sample_steps);
-        load_if_exists("high_noise_steps", high_noise_sample_params.sample_steps);
        load_if_exists("cfg_scale", sample_params.guidance.txt_cfg);
        load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg);
        load_if_exists("guidance", sample_params.guidance.distilled_guidance);

-        auto load_sampler_if_exists = [&](const char* key, enum sample_method_t& out) {
-            if (j.contains(key) && j[key].is_string()) {
-                enum sample_method_t tmp = str_to_sample_method(j[key].get<std::string>().c_str());
-                if (tmp != SAMPLE_METHOD_COUNT) {
-                    out = tmp;
-                }
-            }
-        };
-        load_sampler_if_exists("sample_method", sample_params.sample_method);
-        load_sampler_if_exists("high_noise_sample_method", high_noise_sample_params.sample_method);
-
-        if (j.contains("scheduler") && j["scheduler"].is_string()) {
-            enum scheduler_t tmp = str_to_scheduler(j["scheduler"].get<std::string>().c_str());
-            if (tmp != SCHEDULER_COUNT) {
-                sample_params.scheduler = tmp;
-            }
-        }
-
        return true;
    }

--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@@ -420,9 +420,6 @@ int main(int argc, const char** argv) {
                return;
            }

-            if (gen_params.sample_params.sample_steps > 100)
-                gen_params.sample_params.sample_steps = 100;
-
            if (!gen_params.process_and_check(IMG_GEN, "")) {
                res.status = 400;
                res.set_content(R"({"error":"invalid params"})", "application/json");
@@ -601,9 +598,6 @@ int main(int argc, const char** argv) {
                return;
            }

-            if (gen_params.sample_params.sample_steps > 100)
-                gen_params.sample_params.sample_steps = 100;
-
            if (!gen_params.process_and_check(IMG_GEN, "")) {
                res.status = 400;
                res.set_content(R"({"error":"invalid params"})", "application/json");
--- a/2
+++ b/2
@@ -1 +1 @@
-Subproject commit 8891ab6fc742ac1198736d3da3b73c730e42af84
+Subproject commit 3e9f2ba3b934c20b26873b3c60dbf41b116978ff
--- a/model.cpp
+++ b/model.cpp
@@ -1038,7 +1038,6 @@ SDVersion ModelLoader::get_sd_version() {
    int64_t patch_embedding_channels = 0;
    bool has_img_emb                 = false;
    bool has_middle_block_1          = false;
-    bool has_output_block_71         = false;

    for (auto& [name, tensor_storage] : tensor_storage_map) {
        if (!(is_xl)) {
@@ -1095,9 +1094,6 @@ SDVersion ModelLoader::get_sd_version() {
            tensor_storage.name.find("unet.mid_block.resnets.1.") != std::string::npos) {
            has_middle_block_1 = true;
        }
-        if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos) {
-            has_output_block_71 = true;
-        }
        if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
            tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
            tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@@ -1159,9 +1155,6 @@ SDVersion ModelLoader::get_sd_version() {
            return VERSION_SD1_PIX2PIX;
        }
        if (!has_middle_block_1) {
-            if (!has_output_block_71) {
-                return VERSION_SDXS;
-            }
            return VERSION_SD1_TINY_UNET;
        }
        return VERSION_SD1;
--- a/model.h
+++ b/model.h
@@ -28,7 +28,6 @@ enum SDVersion {
    VERSION_SD2,
    VERSION_SD2_INPAINT,
    VERSION_SD2_TINY_UNET,
-    VERSION_SDXS,
    VERSION_SDXL,
    VERSION_SDXL_INPAINT,
    VERSION_SDXL_PIX2PIX,
@@ -51,7 +50,7 @@ enum SDVersion {
 };

 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET) {
        return true;
    }
    return false;
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -31,7 +31,6 @@ const char* model_version_to_str[] = {
    "SD 2.x",
    "SD 2.x Inpaint",
    "SD 2.x Tiny UNet",
-    "SDXS",
    "SDXL",
    "SDXL Inpaint",
    "SDXL Instruct-Pix2Pix",
@@ -408,11 +407,6 @@ public:
            vae_decode_only = false;
        }

-        bool tae_preview_only = sd_ctx_params->tae_preview_only;
-        if (version == VERSION_SDXS) {
-            tae_preview_only = false;
-        }
-
        if (sd_ctx_params->circular_x || sd_ctx_params->circular_y) {
            LOG_INFO("Using circular padding for convolutions");
        }
@@ -597,7 +591,7 @@ public:
                vae_backend = backend;
            }

-            if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
+            if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
                if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
                    first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
                                                                            offload_params_to_cpu,
@@ -635,7 +629,8 @@ public:
                    first_stage_model->get_param_tensors(tensors, "first_stage_model");
                }
            }
-            if (use_tiny_autoencoder || version == VERSION_SDXS) {
+
+            if (use_tiny_autoencoder) {
                if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
                    tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend,
                                                                             offload_params_to_cpu,
@@ -650,10 +645,6 @@ public:
                                                                             "decoder.layers",
                                                                             vae_decode_only,
                                                                             version);
-                    if (version == VERSION_SDXS) {
-                        tae_first_stage->alloc_params_buffer();
-                        tae_first_stage->get_param_tensors(tensors, "first_stage_model");
-                    }
                }
                if (sd_ctx_params->vae_conv_direct) {
                    LOG_INFO("Using Conv2d direct in the tae model");
@@ -791,15 +782,14 @@ public:
                unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
            }
            size_t vae_params_mem_size = 0;
-            if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
+            if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
                vae_params_mem_size = first_stage_model->get_params_buffer_size();
            }
-            if (use_tiny_autoencoder || version == VERSION_SDXS) {
-                if (use_tiny_autoencoder && !tae_first_stage->load_from_file(taesd_path, n_threads)) {
+            if (use_tiny_autoencoder) {
+                if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
                    return false;
                }
-                use_tiny_autoencoder = true;  // now the processing is identical for VERSION_SDXS
-                vae_params_mem_size  = tae_first_stage->get_params_buffer_size();
+                vae_params_mem_size = tae_first_stage->get_params_buffer_size();
            }
            size_t control_net_params_mem_size = 0;
            if (control_net) {
@@ -955,7 +945,7 @@ public:
        }

        ggml_free(ctx);
-        use_tiny_autoencoder = use_tiny_autoencoder && !tae_preview_only;
+        use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only;
        return true;
    }

--- a/tae.hpp
+++ b/tae.hpp
@@ -505,8 +505,7 @@ struct TinyAutoEncoder : public GGMLRunner {
                         struct ggml_tensor** output,
                         struct ggml_context* output_ctx = nullptr) = 0;

-    virtual bool load_from_file(const std::string& file_path, int n_threads)                                      = 0;
-    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
+    virtual bool load_from_file(const std::string& file_path, int n_threads) = 0;
 };

 struct TinyImageAutoEncoder : public TinyAutoEncoder {
@@ -556,10 +555,6 @@ struct TinyImageAutoEncoder : public TinyAutoEncoder {
        return success;
    }

-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        taesd.get_param_tensors(tensors, prefix);
-    }
-
    struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
        struct ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
        z                       = to_backend(z);
@@ -629,10 +624,6 @@ struct TinyVideoAutoEncoder : public TinyAutoEncoder {
        return success;
    }

-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        taehv.get_param_tensors(tensors, prefix);
-    }
-
    struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
        struct ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
        z                       = to_backend(z);
--- a/unet.hpp
+++ b/unet.hpp
@@ -215,13 +215,10 @@ public:
        } else if (sd_version_is_unet_edit(version)) {
            in_channels = 8;
        }
-        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
+        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET) {
            num_res_blocks = 1;
            channel_mult   = {1, 2, 4};
            tiny_unet      = true;
-            if (version == VERSION_SDXS) {
-                attention_resolutions = {4, 2};  // here just like SDXL
-            }
        }

        // dims is always 2