feat: align the spatial size to the corresponding multiple (#1073)

feat: support custom upscale tile size (#896)
refactor: optimize the handling of LoRA models (#1070)
2025-12-12 13:28:37 +00:00 · 2025-12-10 23:15:08 +08:00 · 2025-12-10 22:25:19 +08:00 · 2025-12-10 00:26:07 +08:00 · 2025-12-09 22:38:54 +08:00 · 2025-12-09 22:06:16 +08:00
13 changed files with 398 additions and 139 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -87,6 +87,38 @@ file(GLOB SD_LIB_SOURCES
    "*.hpp"
 )

+# Embed version information taken from git into the build.
+# NO_CMAKE_FIND_ROOT_PATH keeps the search from being re-rooted under
+# CMAKE_FIND_ROOT_PATH, so the host's git is used.
+find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
+if(GIT_EXE)
+    # Version string from `git describe`; the `--dirty=+` mark is appended by
+    # git when the work tree has local modifications. ERROR_QUIET hides git's
+    # stderr, and a failed call leaves the variable empty.
+    execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE SDCPP_BUILD_VERSION
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    # Abbreviated hash of the checked-out commit.
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+endif()
+
+# Fall back to "unknown" when git is missing or produced no output.
+if(NOT SDCPP_BUILD_VERSION)
+    set(SDCPP_BUILD_VERSION unknown)
+endif()
+message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
+
+if(NOT SDCPP_BUILD_COMMIT)
+    set(SDCPP_BUILD_COMMIT unknown)
+endif()
+message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
+
+# Pass both values as preprocessor definitions to version.cpp only; they are
+# attached to that single source file rather than to a whole target.
+set_property(
+  SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
+  APPEND PROPERTY COMPILE_DEFINITIONS
+  SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
+)
+
 if(SD_BUILD_SHARED_LIBS)
    message("-- Build shared library")
    message(${SD_LIB_SOURCES})
--- a/README.md
+++ b/README.md
@ -105,7 +105,7 @@ API and command-line option may change frequently.***
 ### Download model weights

 - download weights(.ckpt or .safetensors or .gguf). For example
-    - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
+    - Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5 

    ```sh
    curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
--- a/clip.hpp
+++ b/clip.hpp
@ -7,31 +7,6 @@

 /*================================================== CLIPTokenizer ===================================================*/

-__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
-    std::regex re("<lora:([^:]+):([^>]+)>");
-    std::smatch matches;
-    std::unordered_map<std::string, float> filename2multiplier;
-
-    while (std::regex_search(text, matches, re)) {
-        std::string filename = matches[1].str();
-        float multiplier     = std::stof(matches[2].str());
-
-        text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
-
-        if (multiplier == 0.f) {
-            continue;
-        }
-
-        if (filename2multiplier.find(filename) == filename2multiplier.end()) {
-            filename2multiplier[filename] = multiplier;
-        } else {
-            filename2multiplier[filename] += multiplier;
-        }
-    }
-
-    return std::make_pair(filename2multiplier, text);
-}
-
 __STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
    std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
    std::set<int> byte_set;
--- a/esrgan.hpp
+++ b/esrgan.hpp
@ -156,9 +156,10 @@ struct ESRGAN : public GGMLRunner {

    ESRGAN(ggml_backend_t backend,
           bool offload_params_to_cpu,
+           int tile_size                                  = 128,
           const String2TensorStorage& tensor_storage_map = {})
        : GGMLRunner(backend, offload_params_to_cpu) {
-        // rrdb_net will be created in load_from_file
+        this->tile_size = tile_size;
    }

    std::string get_desc() override {
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -324,6 +324,7 @@ struct SDCliParams {
    std::string output_path = "output.png";

    bool verbose          = false;
+    bool version          = false;
    bool canny_preprocess = false;

    preview_t preview_method = PREVIEW_NONE;
@ -366,6 +367,10 @@ struct SDCliParams {
             "--verbose",
             "print extra info",
             true, &verbose},
+            {"",
+             "--version",
+             "print stable-diffusion.cpp version",
+             true, &version},
            {"",
             "--color",
             "colors the logging tags according to level",
@ -502,7 +507,7 @@ struct SDContextParams {
    std::string lora_model_dir;

    std::map<std::string, std::string> embedding_map;
-    std::vector<sd_embedding_t> embedding_array;
+    std::vector<sd_embedding_t> embedding_vec;

    rng_type_t rng_type         = CUDA_RNG;
    rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
@ -947,13 +952,13 @@ struct SDContextParams {
    }

    sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) {
-        embedding_array.clear();
-        embedding_array.reserve(embedding_map.size());
+        embedding_vec.clear();
+        embedding_vec.reserve(embedding_map.size());
        for (const auto& kv : embedding_map) {
            sd_embedding_t item;
            item.name = kv.first.c_str();
            item.path = kv.second.c_str();
-            embedding_array.emplace_back(item);
+            embedding_vec.emplace_back(item);
        }

        sd_ctx_params_t sd_ctx_params = {
@ -970,8 +975,8 @@ struct SDContextParams {
            taesd_path.c_str(),
            control_net_path.c_str(),
            lora_model_dir.c_str(),
-            embedding_array.data(),
-            static_cast<uint32_t>(embedding_array.size()),
+            embedding_vec.data(),
+            static_cast<uint32_t>(embedding_vec.size()),
            photo_maker_path.c_str(),
            tensor_type_rules.c_str(),
            vae_decode_only,
@ -1025,6 +1030,15 @@ static std::string vec_str_to_string(const std::vector<std::string>& v) {
    return oss.str();
 }

+// Reports whether `p` is an absolute path on the host platform.
+//
+// Delegates to the filesystem library (the `fs` alias this file already uses;
+// assumed to be std::filesystem) instead of inspecting characters by hand.
+// On POSIX this is still "starts with '/'". On Windows it additionally
+// recognises UNC paths (\\server\share\...) and no longer mistakes
+// drive-relative paths such as "C:foo" for absolute ones.
+static bool is_absolute_path(const std::string& p) {
+    return fs::path(p).is_absolute();
+}
+
 struct SDGenerationParams {
    std::string prompt;
    std::string negative_prompt;
@ -1065,7 +1079,12 @@ struct SDGenerationParams {
    std::string pm_id_embed_path;
    float pm_style_strength = 20.f;

-    int upscale_repeats = 1;
+    int upscale_repeats   = 1;
+    int upscale_tile_size = 128;
+
+    std::map<std::string, float> lora_map;
+    std::map<std::string, float> high_noise_lora_map;
+    std::vector<sd_lora_t> lora_vec;

    SDGenerationParams() {
        sd_sample_params_init(&sample_params);
@ -1158,6 +1177,10 @@ struct SDGenerationParams {
             "--upscale-repeats",
             "Run the ESRGAN upscaler this many times (default: 1)",
             &upscale_repeats},
+            {"",
+             "--upscale-tile-size",
+             "tile size for ESRGAN upscaling (default: 128)",
+             &upscale_tile_size},
        };

        options.float_options = {
@ -1437,7 +1460,88 @@ struct SDGenerationParams {
        return options;
    }

-    bool process_and_check(SDMode mode) {
+    // Pulls every `<lora:NAME:MULTIPLIER>` tag out of `prompt` and records the
+    // LoRA files they reference.
+    //
+    // NAME may start with "|high_noise|", which files the entry under
+    // high_noise_lora_map instead of lora_map. The rest of NAME is used as-is
+    // when it is an absolute path and is otherwise resolved against
+    // `lora_model_dir`; if that file does not exist, the extensions .pt,
+    // .safetensors and .gguf are tried in turn. Multipliers of tags that
+    // resolve to the same normalized path are summed. A tag whose multiplier
+    // cannot be parsed, or whose file cannot be found, is removed from the
+    // prompt, reported, and otherwise ignored.
+    //
+    // Afterwards lora_vec is rebuilt from both maps. Its `path` pointers borrow
+    // the storage of the map keys, so those map entries must outlive any use of
+    // lora_vec.
+    void extract_and_remove_lora(const std::string& lora_model_dir) {
+        static const std::regex re(R"(<lora:([^:>]+):([^>]+)>)");
+        static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};
+        static const std::string high_noise_prefix      = "|high_noise|";
+
+        // `remaining` is the not-yet-scanned tail of the prompt; `cleaned`
+        // collects the text between tags. Building the result from the scanned
+        // segments guarantees that exactly the tags processed here are removed.
+        std::string remaining = prompt;
+        std::string cleaned;
+        cleaned.reserve(prompt.size());
+        std::smatch m;
+
+        while (std::regex_search(remaining, m, re)) {
+            std::string raw_path      = m[1].str();
+            const std::string raw_mul = m[2].str();
+
+            // The tag is dropped from the prompt whatever happens next.
+            // `m` refers into `remaining`, so it must not be used after the
+            // reassignment below.
+            cleaned += m.prefix().str();
+            remaining = m.suffix().str();
+
+            float mul = 0.f;
+            try {
+                mul = std::stof(raw_mul);
+            } catch (...) {
+                printf("invalid multiplier '%s' for lora %s, ignored\n", raw_mul.c_str(), raw_path.c_str());
+                continue;
+            }
+
+            bool is_high_noise = false;
+            if (raw_path.rfind(high_noise_prefix, 0) == 0) {
+                raw_path.erase(0, high_noise_prefix.size());
+                is_high_noise = true;
+            }
+
+            fs::path final_path;
+            if (is_absolute_path(raw_path)) {
+                final_path = raw_path;
+            } else {
+                final_path = fs::path(lora_model_dir) / raw_path;
+            }
+            if (!fs::exists(final_path)) {
+                // The tag may omit the extension: probe the known ones.
+                bool found = false;
+                for (const auto& ext : valid_ext) {
+                    fs::path try_path = final_path;
+                    try_path += ext;
+                    if (fs::exists(try_path)) {
+                        final_path = try_path;
+                        found      = true;
+                        break;
+                    }
+                }
+                if (!found) {
+                    printf("can not find lora %s\n", final_path.lexically_normal().string().c_str());
+                    continue;
+                }
+            }
+
+            const std::string key = final_path.lexically_normal().string();
+
+            if (is_high_noise) {
+                high_noise_lora_map[key] += mul;
+            } else {
+                lora_map[key] += mul;
+            }
+        }
+
+        cleaned += remaining;
+        prompt.swap(cleaned);
+
+        // Rebuild from scratch so a repeated call cannot leave duplicates.
+        lora_vec.clear();
+        lora_vec.reserve(lora_map.size() + high_noise_lora_map.size());
+
+        for (const auto& kv : lora_map) {
+            sd_lora_t item;
+            item.is_high_noise = false;
+            item.path          = kv.first.c_str();
+            item.multiplier    = kv.second;
+            lora_vec.emplace_back(item);
+        }
+
+        for (const auto& kv : high_noise_lora_map) {
+            sd_lora_t item;
+            item.is_high_noise = true;
+            item.path          = kv.first.c_str();
+            item.multiplier    = kv.second;
+            lora_vec.emplace_back(item);
+        }
+    }
+
+    bool process_and_check(SDMode mode, const std::string& lora_model_dir) {
        if (width <= 0) {
            fprintf(stderr, "error: the width must be greater than 0\n");
            return false;
@ -1536,6 +1640,10 @@ struct SDGenerationParams {
            return false;
        }

+        if (upscale_tile_size < 1) {
+            return false;
+        }
+
        if (mode == UPSCALE) {
            if (init_image_path.length() == 0) {
                fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n");
@ -1548,14 +1656,44 @@ struct SDGenerationParams {
            seed = rand();
        }

+        extract_and_remove_lora(lora_model_dir);
+
        return true;
    }

    std::string to_string() const {
        char* sample_params_str            = sd_sample_params_to_str(&sample_params);
        char* high_noise_sample_params_str = sd_sample_params_to_str(&high_noise_sample_params);
+
+        std::ostringstream lora_ss;
+        lora_ss << "{\n";
+        for (auto it = lora_map.begin(); it != lora_map.end(); ++it) {
+            lora_ss << "    \"" << it->first << "\": \"" << it->second << "\"";
+            if (std::next(it) != lora_map.end()) {
+                lora_ss << ",";
+            }
+            lora_ss << "\n";
+        }
+        lora_ss << "  }";
+        std::string loras_str = lora_ss.str();
+
+        lora_ss = std::ostringstream();
+        ;
+        lora_ss << "{\n";
+        for (auto it = high_noise_lora_map.begin(); it != high_noise_lora_map.end(); ++it) {
+            lora_ss << "    \"" << it->first << "\": \"" << it->second << "\"";
+            if (std::next(it) != high_noise_lora_map.end()) {
+                lora_ss << ",";
+            }
+            lora_ss << "\n";
+        }
+        lora_ss << "  }";
+        std::string high_noise_loras_str = lora_ss.str();
+
        std::ostringstream oss;
        oss << "SDGenerationParams {\n"
+            << "  loras: \"" << loras_str << "\",\n"
+            << "  high_noise_loras: \"" << high_noise_loras_str << "\",\n"
            << "  prompt: \"" << prompt << "\",\n"
            << "  negative_prompt: \"" << negative_prompt << "\",\n"
            << "  clip_skip: " << clip_skip << ",\n"
@ -1591,6 +1729,7 @@ struct SDGenerationParams {
            << "  control_strength: " << control_strength << ",\n"
            << "  seed: " << seed << ",\n"
            << "  upscale_repeats: " << upscale_repeats << ",\n"
+            << "  upscale_tile_size: " << upscale_tile_size << ",\n"
            << "}";
        free(sample_params_str);
        free(high_noise_sample_params_str);
@ -1598,7 +1737,12 @@ struct SDGenerationParams {
    }
 };

+// Builds the one-line banner
+// "stable-diffusion.cpp version <sd_version()>, commit <sd_commit()>".
+static std::string version_string() {
+    std::string banner = "stable-diffusion.cpp version ";
+    banner += sd_version();
+    banner += ", commit ";
+    banner += sd_commit();
+    return banner;
+}
+
 void print_usage(int argc, const char* argv[], const std::vector<ArgOptions>& options_list) {
+    std::cout << version_string() << "\n";
    std::cout << "Usage: " << argv[0] << " [options]\n\n";
    std::cout << "CLI Options:\n";
    options_list[0].print();
@ -1616,7 +1760,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
        exit(cli_params.normal_exit ? 0 : 1);
    }

-    if (!cli_params.process_and_check() || !ctx_params.process_and_check(cli_params.mode) || !gen_params.process_and_check(cli_params.mode)) {
+    if (!cli_params.process_and_check() ||
+        !ctx_params.process_and_check(cli_params.mode) ||
+        !gen_params.process_and_check(cli_params.mode, ctx_params.lora_model_dir)) {
        print_usage(argc, argv, options_vec);
        exit(1);
    }
@ -1881,11 +2027,19 @@ void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy,
 }

 int main(int argc, const char* argv[]) {
+    if (argc > 1 && std::string(argv[1]) == "--version") {
+        std::cout << version_string() << "\n";
+        return EXIT_SUCCESS;
+    }
+
    SDCliParams cli_params;
    SDContextParams ctx_params;
    SDGenerationParams gen_params;

    parse_args(argc, argv, cli_params, ctx_params, gen_params);
+    if (cli_params.verbose || cli_params.version) {
+        std::cout << version_string() << "\n";
+    }
    if (gen_params.video_frames > 4) {
        size_t last_dot_pos   = cli_params.preview_path.find_last_of(".");
        std::string base_path = cli_params.preview_path;
@ -2121,6 +2275,8 @@ int main(int argc, const char* argv[]) {

        if (cli_params.mode == IMG_GEN) {
            sd_img_gen_params_t img_gen_params = {
+                gen_params.lora_vec.data(),
+                static_cast<uint32_t>(gen_params.lora_vec.size()),
                gen_params.prompt.c_str(),
                gen_params.negative_prompt.c_str(),
                gen_params.clip_skip,
@ -2152,6 +2308,8 @@ int main(int argc, const char* argv[]) {
            num_results = gen_params.batch_count;
        } else if (cli_params.mode == VID_GEN) {
            sd_vid_gen_params_t vid_gen_params = {
+                gen_params.lora_vec.data(),
+                static_cast<uint32_t>(gen_params.lora_vec.size()),
                gen_params.prompt.c_str(),
                gen_params.negative_prompt.c_str(),
                gen_params.clip_skip,
@ -2188,7 +2346,8 @@ int main(int argc, const char* argv[]) {
        upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
                                                        ctx_params.offload_params_to_cpu,
                                                        ctx_params.diffusion_conv_direct,
-                                                        ctx_params.n_threads);
+                                                        ctx_params.n_threads,
+                                                        gen_params.upscale_tile_size);

        if (upscaler_ctx == nullptr) {
            printf("new_upscaler_ctx failed\n");
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -60,6 +60,14 @@
 #define SD_UNUSED(x) (void)(x)
 #endif

+// Returns how much must be added to `n` to reach the next multiple of
+// `multiple`; 0 when `n` is already aligned.
+// A non-positive `multiple` has no meaningful alignment and would otherwise
+// divide by zero (or produce a negative offset), so it yields 0.
+__STATIC_INLINE__ int align_up_offset(int n, int multiple) {
+    if (multiple <= 0) {
+        return 0;
+    }
+    return (multiple - n % multiple) % multiple;
+}
+
+// Rounds `n` up to the nearest multiple of `multiple` by adding the padding
+// computed by align_up_offset.
+__STATIC_INLINE__ int align_up(int n, int multiple) {
+    const int padding = align_up_offset(n, multiple);
+    return n + padding;
+}
+
 __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
    switch (level) {
        case GGML_LOG_LEVEL_DEBUG:
--- a/latent-preview.h
+++ b/latent-preview.h
@ -91,6 +91,41 @@ const float flux_latent_rgb_proj[16][3] = {
    {-0.111849f, -0.055589f, -0.032361f}};
 float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};

+const float flux2_latent_rgb_proj[32][3] = {
+    {0.000736f, -0.008385f, -0.019710f},
+    {-0.001352f, -0.016392f, 0.020693f},
+    {-0.006376f, 0.002428f, 0.036736f},
+    {0.039384f, 0.074167f, 0.119789f},
+    {0.007464f, -0.005705f, -0.004734f},
+    {-0.004086f, 0.005287f, -0.000409f},
+    {-0.032835f, 0.050802f, -0.028120f},
+    {-0.003158f, -0.000835f, 0.000406f},
+    {-0.112840f, -0.084337f, -0.023083f},
+    {0.001462f, -0.006656f, 0.000549f},
+    {-0.009980f, -0.007480f, 0.009702f},
+    {0.032540f, 0.000214f, -0.061388f},
+    {0.011023f, 0.000694f, 0.007143f},
+    {-0.001468f, -0.006723f, -0.001678f},
+    {-0.005921f, -0.010320f, -0.003907f},
+    {-0.028434f, 0.027584f, 0.018457f},
+    {0.014349f, 0.011523f, 0.000441f},
+    {0.009874f, 0.003081f, 0.001507f},
+    {0.002218f, 0.005712f, 0.001563f},
+    {0.053010f, -0.019844f, 0.008683f},
+    {-0.002507f, 0.005384f, 0.000938f},
+    {-0.002177f, -0.011366f, 0.003559f},
+    {-0.000261f, 0.015121f, -0.003240f},
+    {-0.003944f, -0.002083f, 0.005043f},
+    {-0.009138f, 0.011336f, 0.003781f},
+    {0.011429f, 0.003985f, -0.003855f},
+    {0.010518f, -0.005586f, 0.010131f},
+    {0.007883f, 0.002912f, -0.001473f},
+    {-0.003318f, -0.003160f, 0.003684f},
+    {-0.034560f, -0.008740f, 0.012996f},
+    {0.000166f, 0.001079f, -0.012153f},
+    {0.017772f, 0.000937f, -0.011953f}};
+float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
+
 // This one was taken straight from
 // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
 // (MiT Licence)
@ -128,16 +163,42 @@ const float sd_latent_rgb_proj[4][3] = {
    {-0.178022f, -0.200862f, -0.678514f}};
 float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};

-void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
+void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
    size_t buffer_head = 0;
+
+    uint32_t latent_width  = latents->ne[0];
+    uint32_t latent_height = latents->ne[1];
+    uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];
+    uint32_t frames        = 1;
+    if (ggml_n_dims(latents) == 4) {
+        frames = latents->ne[2];
+    }
+
+    uint32_t rgb_width  = latent_width * patch_size;
+    uint32_t rgb_height = latent_height * patch_size;
+
+    uint32_t unpatched_dim = dim / (patch_size * patch_size);
+
    for (int k = 0; k < frames; k++) {
-        for (int j = 0; j < height; j++) {
-            for (int i = 0; i < width; i++) {
-                size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
+        for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
+            for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
+                int latent_x = rgb_x / patch_size;
+                int latent_y = rgb_y / patch_size;
+
+                int channel_offset = 0;
+                if (patch_size > 1) {
+                    channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size));
+                }
+
+                size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]);
+
+                // should be incremented by 1 for each pixel
+                size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x;
+
                float r = 0, g = 0, b = 0;
                if (latent_rgb_proj != nullptr) {
-                    for (int d = 0; d < dim; d++) {
-                        float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
+                    for (int d = 0; d < unpatched_dim; d++) {
+                        float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]);
                        r += value * latent_rgb_proj[d][0];
                        g += value * latent_rgb_proj[d][1];
                        b += value * latent_rgb_proj[d][2];
@ -164,9 +225,9 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
                g = g >= 0 ? g <= 1 ? g : 1 : 0;
                b = b >= 0 ? b <= 1 ? b : 1 : 0;

-                buffer[buffer_head++] = (uint8_t)(r * 255);
-                buffer[buffer_head++] = (uint8_t)(g * 255);
-                buffer[buffer_head++] = (uint8_t)(b * 255);
+                buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255);
+                buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255);
+                buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255);
            }
        }
    }
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -937,28 +937,17 @@ public:
                                                         float multiplier,
                                                         ggml_backend_t backend,
                                                         LoraModel::filter_t lora_tensor_filter = nullptr) {
-        std::string lora_name      = lora_id;
-        std::string high_noise_tag = "|high_noise|";
-        bool is_high_noise         = false;
-        if (starts_with(lora_name, high_noise_tag)) {
-            lora_name     = lora_name.substr(high_noise_tag.size());
+        std::string lora_path             = lora_id;
+        static std::string high_noise_tag = "|high_noise|";
+        bool is_high_noise                = false;
+        if (starts_with(lora_path, high_noise_tag)) {
+            lora_path     = lora_path.substr(high_noise_tag.size());
            is_high_noise = true;
-            LOG_DEBUG("high noise lora: %s", lora_name.c_str());
+            LOG_DEBUG("high noise lora: %s", lora_path.c_str());
        }
-        std::string st_file_path   = path_join(lora_model_dir, lora_name + ".safetensors");
-        std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
-        std::string file_path;
-        if (file_exists(st_file_path)) {
-            file_path = st_file_path;
-        } else if (file_exists(ckpt_file_path)) {
-            file_path = ckpt_file_path;
-        } else {
-            LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
-            return nullptr;
-        }
-        auto lora = std::make_shared<LoraModel>(lora_id, backend, file_path, is_high_noise ? "model.high_noise_" : "", version);
+        auto lora = std::make_shared<LoraModel>(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version);
        if (!lora->load_from_file(n_threads, lora_tensor_filter)) {
-            LOG_WARN("load lora tensors from %s failed", file_path.c_str());
+            LOG_WARN("load lora tensors from %s failed", lora_path.c_str());
            return nullptr;
        }

@ -1143,12 +1132,15 @@ public:
        }
    }

-    std::string apply_loras_from_prompt(const std::string& prompt) {
-        auto result_pair                                = extract_and_remove_lora(prompt);
-        std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
-
-        for (auto& kv : lora_f2m) {
-            LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
+    void apply_loras(const sd_lora_t* loras, uint32_t lora_count) {
+        std::unordered_map<std::string, float> lora_f2m;
+        for (int i = 0; i < lora_count; i++) {
+            std::string lora_id = SAFE_STR(loras[i].path);
+            if (loras[i].is_high_noise) {
+                lora_id = "|high_noise|" + lora_id;
+            }
+            lora_f2m[lora_id] = loras[i].multiplier;
+            LOG_DEBUG("lora %s:%.2f", lora_id.c_str(), loras[i].multiplier);
        }
        int64_t t0 = ggml_time_ms();
        if (apply_lora_immediately) {
@ -1159,9 +1151,7 @@ public:
        int64_t t1 = ggml_time_ms();
        if (!lora_f2m.empty()) {
            LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
-            LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str());
        }
-        return result_pair.second;
    }

    ggml_tensor* id_encoder(ggml_context* work_ctx,
@ -1326,10 +1316,17 @@ public:
        uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];

        if (preview_mode == PREVIEW_PROJ) {
+            int64_t patch_sz                       = 1;
            const float(*latent_rgb_proj)[channel] = nullptr;
            float* latent_rgb_bias                 = nullptr;

-            if (dim == 48) {
+            if (dim == 128) {
+                if (sd_version_is_flux2(version)) {
+                    latent_rgb_proj = flux2_latent_rgb_proj;
+                    latent_rgb_bias = flux2_latent_rgb_bias;
+                    patch_sz        = 2;
+                }
+            } else if (dim == 48) {
                if (sd_version_is_wan(version)) {
                    latent_rgb_proj = wan_22_latent_rgb_proj;
                    latent_rgb_bias = wan_22_latent_rgb_bias;
@ -1382,12 +1379,15 @@ public:
                frames = latents->ne[2];
            }

-            uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t));
+            uint32_t img_width  = width * patch_sz;
+            uint32_t img_height = height * patch_sz;

-            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
+            uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t));
+
+            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz);
            sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
            for (int i = 0; i < frames; i++) {
-                images[i] = {width, height, channel, data + i * width * height * channel};
+                images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel};
            }
            step_callback(step, frames, images, is_noisy, step_callback_data);
            free(data);
@ -1898,6 +1898,18 @@ public:
        return vae_scale_factor;
    }

+    // Down factor of the diffusion model for the loaded `version`; callers
+    // multiply it with the VAE scale factor to get the multiple that image
+    // dimensions are aligned to. Non-DiT (unet) models use 8, Wan uses 2,
+    // every other DiT model uses 1.
+    int get_diffusion_model_down_factor() {
+        if (!sd_version_is_dit(version)) {
+            return 8;  // unet
+        }
+        return sd_version_is_wan(version) ? 2 : 1;
+    }
+
    int get_latent_channel() {
        int latent_channel = 4;
        if (sd_version_is_dit(version)) {
@ -2805,8 +2817,6 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
    int sample_steps = sigmas.size() - 1;

    int64_t t0 = ggml_time_ms();
-    // Apply lora
-    prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);

    // Photo Maker
    std::string prompt_text_only;
@ -3135,22 +3145,19 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
    sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
    int width                     = sd_img_gen_params->width;
    int height                    = sd_img_gen_params->height;
-    int vae_scale_factor          = sd_ctx->sd->get_vae_scale_factor();
-    if (sd_version_is_dit(sd_ctx->sd->version)) {
-        if (width % 16 || height % 16) {
-            LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)",
-                      model_version_to_str[sd_ctx->sd->version],
-                      width,
-                      height);
-            return nullptr;
-        }
-    } else if (width % 64 || height % 64) {
-        LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)",
-                  model_version_to_str[sd_ctx->sd->version],
-                  width,
-                  height);
-        return nullptr;
+
+    int vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
+    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
+    int spatial_multiple            = vae_scale_factor * diffusion_model_down_factor;
+
+    int width_offset  = align_up_offset(width, spatial_multiple);
+    int height_offset = align_up_offset(height, spatial_multiple);
+    if (width_offset > 0 || height_offset > 0) {
+        width += width_offset;
+        height += height_offset;
+        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple);
    }
+
    LOG_DEBUG("generate_image %dx%d", width, height);
    if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
        return nullptr;
@ -3178,6 +3185,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g

    size_t t0 = ggml_time_ms();

+    // Apply lora
+    sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);
+
    enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method;
    if (sample_method == SAMPLE_METHOD_COUNT) {
        sample_method = sd_get_default_sample_method(sd_ctx);
@ -3421,9 +3431,19 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
    int frames       = sd_vid_gen_params->video_frames;
    frames           = (frames - 1) / 4 * 4 + 1;
    int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
-    LOG_INFO("generate_video %dx%dx%d", width, height, frames);

-    int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
+    int vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
+    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
+    int spatial_multiple            = vae_scale_factor * diffusion_model_down_factor;
+
+    int width_offset  = align_up_offset(width, spatial_multiple);
+    int height_offset = align_up_offset(height, spatial_multiple);
+    if (width_offset > 0 || height_offset > 0) {
+        width += width_offset;
+        height += height_offset;
+        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple);
+    }
+    LOG_INFO("generate_video %dx%dx%d", width, height, frames);

    enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
    if (sample_method == SAMPLE_METHOD_COUNT) {
@ -3477,7 +3497,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
    int64_t t0 = ggml_time_ms();

    // Apply lora
-    prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
+    sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);

    ggml_tensor* init_latent        = nullptr;
    ggml_tensor* clip_vision_output = nullptr;
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -242,6 +242,14 @@ typedef struct {
 } sd_easycache_params_t;

 typedef struct {  // one LoRA entry; arrays of these are passed to apply_loras() via the params' loras/lora_count fields
+    bool is_high_noise;  // NOTE(review): presumably targets the high-noise diffusion model — confirm against apply_loras
+    float multiplier;  // NOTE(review): presumably the strength applied to this LoRA — confirm
+    const char* path;  // NOTE(review): presumably the LoRA model file location; ownership/lifetime not visible here
+} sd_lora_t;
+
+typedef struct {
+    const sd_lora_t* loras;
+    uint32_t lora_count;
    const char* prompt;
    const char* negative_prompt;
    int clip_skip;
@ -265,6 +273,8 @@ typedef struct {
 } sd_img_gen_params_t;

 typedef struct {
+    const sd_lora_t* loras;
+    uint32_t lora_count;
    const char* prompt;
    const char* negative_prompt;
    int clip_skip;
@ -337,7 +347,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                        bool offload_params_to_cpu,
                                        bool direct,
-                                        int n_threads);
+                                        int n_threads,
+                                        int tile_size);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);

 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
@ -359,6 +370,9 @@ SD_API bool preprocess_canny(sd_image_t image,
                             float strong,
                             bool inverse);

+SD_API const char* sd_commit(void);
+SD_API const char* sd_version(void);
+
 #ifdef __cplusplus
 }
 #endif
--- a/upscaler.cpp
+++ b/upscaler.cpp
@ -9,12 +9,15 @@ struct UpscalerGGML {
    std::shared_ptr<ESRGAN> esrgan_upscaler;
    std::string esrgan_path;
    int n_threads;
-    bool direct = false;
+    bool direct   = false;
+    int tile_size = 128;

    UpscalerGGML(int n_threads,
-                 bool direct = false)
+                 bool direct   = false,
+                 int tile_size = 128)
        : n_threads(n_threads),
-          direct(direct) {
+          direct(direct),
+          tile_size(tile_size) {
    }

    bool load_from_file(const std::string& esrgan_path,
@ -51,7 +54,7 @@ struct UpscalerGGML {
            backend = ggml_backend_cpu_init();
        }
        LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.get_tensor_storage_map());
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
        if (direct) {
            esrgan_upscaler->set_conv2d_direct_enabled(true);
        }
@ -113,14 +116,15 @@ struct upscaler_ctx_t {
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                 bool offload_params_to_cpu,
                                 bool direct,
-                                 int n_threads) {
+                                 int n_threads,
+                                 int tile_size) {
    upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
    if (upscaler_ctx == nullptr) {
        return nullptr;
    }
    std::string esrgan_path(esrgan_path_c_str);

-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size);
    if (upscaler_ctx->upscaler == nullptr) {
        return nullptr;
    }
--- a/util.cpp
+++ b/util.cpp
@ -95,20 +95,6 @@ bool is_directory(const std::string& path) {
    return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
 }

-std::string get_full_path(const std::string& dir, const std::string& filename) {
-    std::string full_path = dir + "\\" + filename;
-
-    WIN32_FIND_DATA find_file_data;
-    HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
-
-    if (hFind != INVALID_HANDLE_VALUE) {
-        FindClose(hFind);
-        return full_path;
-    } else {
-        return "";
-    }
-}
-
 #else  // Unix
 #include <dirent.h>
 #include <sys/stat.h>
@ -123,26 +109,6 @@ bool is_directory(const std::string& path) {
    return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
 }

-// TODO: add windows version
-std::string get_full_path(const std::string& dir, const std::string& filename) {
-    DIR* dp = opendir(dir.c_str());
-
-    if (dp != nullptr) {
-        struct dirent* entry;
-
-        while ((entry = readdir(dp)) != nullptr) {
-            if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
-                closedir(dp);
-                return dir + "/" + entry->d_name;
-            }
-        }
-
-        closedir(dp);
-    }
-
-    return "";
-}
-
 #endif

 // get_num_physical_cores is copy from
--- a/util.h
+++ b/util.h
@ -22,7 +22,6 @@ int round_up_to(int value, int base);

 bool file_exists(const std::string& filename);
 bool is_directory(const std::string& path);
-std::string get_full_path(const std::string& dir, const std::string& filename);

 std::u32string utf8_to_utf32(const std::string& utf8_str);
 std::string utf32_to_utf8(const std::u32string& utf32_str);
--- a/version.cpp
+++ b/version.cpp
@ -0,0 +1,20 @@
+#include "stable-diffusion.h"
+
+#ifndef SDCPP_BUILD_COMMIT  // fallback for builds that do not pass the commit as a compile definition
+#define SDCPP_BUILD_COMMIT unknown  // bare token, not a string literal; it is stringized by STRINGIZE
+#endif
+
+#ifndef SDCPP_BUILD_VERSION  // same fallback for the version identifier
+#define SDCPP_BUILD_VERSION unknown
+#endif
+
+#define STRINGIZE2(x) #x  // turns its argument into a string literal exactly as written, without expanding it
+#define STRINGIZE(x) STRINGIZE2(x)  // extra macro level so x is expanded before STRINGIZE2 stringizes it
+
+const char* sd_commit(void) {  // returns the value of SDCPP_BUILD_COMMIT as a string literal
+    return STRINGIZE(SDCPP_BUILD_COMMIT);  // yields "unknown" when the macro was not supplied at compile time
+}
+
+const char* sd_version(void) {  // returns the value of SDCPP_BUILD_VERSION as a string literal
+    return STRINGIZE(SDCPP_BUILD_VERSION);  // yields "unknown" when the macro was not supplied at compile time
+}
Author	SHA1	Message	Date
leejet	8823dc48bc	feat: align the spatial size to the corresponding multiple (#1073 )	2025-12-10 23:15:08 +08:00
Pedrito	1ac5a616de	feat: support custom upscale tile size (#896 )	2025-12-10 22:25:19 +08:00
leejet	d939f6e86a	refactor: optimize the handling of LoRA models (#1070 )	2025-12-10 00:26:07 +08:00
Wagner Bruna	e72aea796e	feat: embed version string and git commit hash (#1008 )	2025-12-09 22:38:54 +08:00
wuhei	a908436729	docs: update download link for Stable Diffusion v1.5 (#1063 )	2025-12-09 22:06:16 +08:00
stduhpf	583a02e29e	feat: add Flux.2 VAE proj matrix for previews (#1017 )	2025-12-09 22:00:45 +08:00