Compare commits

...

2 Commits

Author SHA1 Message Date
stduhpf
19fbfd8639
feat: override text encoders for unet models (#682) 2025-07-04 22:19:47 +08:00
Wagner Bruna
76c72628b1
fix: fix a few typos on cli help and error messages (#714) 2025-07-04 22:15:41 +08:00
5 changed files with 33 additions and 16 deletions

View File

@@ -181,7 +181,7 @@ struct AYSSchedule : SigmaSchedule {
LOG_INFO("AYS using SVD noise levels");
inputs = noise_levels[2];
} else {
LOG_ERROR("Version not compatable with AYS scheduler");
LOG_ERROR("Version not compatible with AYS scheduler");
return results;
}

View File

@@ -60,6 +60,7 @@ const char* modes_str[] = {
"edit",
"convert",
};
#define SD_ALL_MODES_STR "txt2img, img2img, edit, convert"
enum SDMode {
TXT2IMG,
@@ -199,14 +200,18 @@ void print_usage(int argc, const char* argv[]) {
printf("\n");
printf("arguments:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n");
printf(" -M, --mode [MODE] run mode, one of:\n");
printf(" txt2img: generate an image from a text prompt (default)\n");
printf(" img2img: generate an image from a text prompt and an initial image (--init-img)\n");
printf(" edit: modify an image (--ref-image) based on text instructions\n");
printf(" convert: convert a model file to gguf format, optionally with quantization\n");
printf(" -t, --threads N number of threads to use during computation (default: -1)\n");
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
printf(" -m, --model [MODEL] path to full model\n");
printf(" --diffusion-model path to the standalone diffusion model\n");
printf(" --clip_l path to the clip-l text encoder\n");
printf(" --clip_g path to the clip-g text encoder\n");
printf(" --t5xxl path to the the t5xxl text encoder\n");
printf(" --t5xxl path to the t5xxl text encoder\n");
printf(" --vae [VAE] path to vae\n");
printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
printf(" --control-net [CONTROL_PATH] path to control net model\n");
@@ -222,7 +227,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
printf(" --mask [MASK] path to the mask image, required by img2img with mask\n");
printf(" --control-image [IMAGE] path to image condition, control net\n");
printf(" -r, --ref_image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
@@ -291,8 +296,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
}
if (mode_found == -1) {
fprintf(stderr,
"error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n",
mode_selected);
"error: invalid mode %s, must be one of [%s]\n",
mode_selected, SD_ALL_MODES_STR);
exit(1);
}
params.mode = (SDMode)mode_found;
@@ -1218,4 +1223,4 @@ int main(int argc, const char* argv[]) {
free(input_image_buffer);
return 0;
}
}

View File

@@ -1539,6 +1539,15 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
return true;
}
bool ModelLoader::model_is_unet() {
for (auto& tensor_storage : tensor_storages) {
if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) {
return true;
}
}
return false;
}
SDVersion ModelLoader::get_sd_version() {
TensorStorage token_embedding_weight, input_block_weight;
bool input_block_checked = false;

View File

@@ -210,6 +210,7 @@ public:
std::map<std::string, enum ggml_type> tensor_storages_types;
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
bool model_is_unet();
SDVersion get_sd_version();
ggml_type get_sd_wtype();
ggml_type get_conditioner_wtype();

View File

@@ -213,16 +213,25 @@ public:
}
}
if (diffusion_model_path.size() > 0) {
LOG_INFO("loading diffusion model from '%s'", diffusion_model_path.c_str());
if (!model_loader.init_from_file(diffusion_model_path, "model.diffusion_model.")) {
LOG_WARN("loading diffusion model from '%s' failed", diffusion_model_path.c_str());
}
}
bool is_unet = model_loader.model_is_unet();
if (clip_l_path.size() > 0) {
LOG_INFO("loading clip_l from '%s'", clip_l_path.c_str());
if (!model_loader.init_from_file(clip_l_path, "text_encoders.clip_l.transformer.")) {
if (!model_loader.init_from_file(clip_l_path, is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer.")) {
LOG_WARN("loading clip_l from '%s' failed", clip_l_path.c_str());
}
}
if (clip_g_path.size() > 0) {
LOG_INFO("loading clip_g from '%s'", clip_g_path.c_str());
if (!model_loader.init_from_file(clip_g_path, "text_encoders.clip_g.transformer.")) {
if (!model_loader.init_from_file(clip_g_path, is_unet ? "cond_stage_model.1.transformer." : "text_encoders.clip_g.transformer.")) {
LOG_WARN("loading clip_g from '%s' failed", clip_g_path.c_str());
}
}
@@ -234,13 +243,6 @@ public:
}
}
if (diffusion_model_path.size() > 0) {
LOG_INFO("loading diffusion model from '%s'", diffusion_model_path.c_str());
if (!model_loader.init_from_file(diffusion_model_path, "model.diffusion_model.")) {
LOG_WARN("loading diffusion model from '%s' failed", diffusion_model_path.c_str());
}
}
if (vae_path.size() > 0) {
LOG_INFO("loading vae from '%s'", vae_path.c_str());
if (!model_loader.init_from_file(vae_path, "vae.")) {